kernel/sched.c

   1 /*
   2  *  kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991-2002  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13  *              hybrid priority-list and round-robin design with
  14  *              an array-switch method of distributing timeslices
  15  *              and per-CPU runqueues.  Cleanups and useful suggestions
  16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17  */
  18
  19 #include <linux/mm.h>
  20 #include <linux/nmi.h>
  21 #include <linux/init.h>
  22 #include <asm/uaccess.h>
  23 #include <linux/highmem.h>
  24 #include <linux/smp_lock.h>
  25 #include <asm/mmu_context.h>
  26 #include <linux/interrupt.h>
  27 #include <linux/completion.h>
  28 #include <linux/kernel_stat.h>
  29 #include <linux/security.h>
  30 #include <linux/notifier.h>
  31 #include <linux/blkdev.h>
  32 #include <linux/delay.h>
  33 #include <linux/timer.h>
  34 #include <linux/rcupdate.h>
  35 #include <linux/cpu.h>
  36 #include <linux/percpu.h>
  37
  38 #ifdef CONFIG_NUMA
  39 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
  40 #else
  41 #define cpu_to_node_mask(cpu) (cpu_online_map)
  42 #endif
  43
  44 /*
  45  * Convert user-nice values [ -20 ... 0 ... 19 ]
  46  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  47  * and back.
  48  */
  49 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
  50 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
  51 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  52
  53 /*
  54  * 'User priority' is the nice value converted to something we
  55  * can work with better when scaling various scheduler parameters,
  56  * it's a [ 0 ... 39 ] range.
  57  */
  58 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
  59 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
  60 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
  61
  62 /*
  63  * These are the 'tuning knobs' of the scheduler:
  64  *
  65  * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
  66  * maximum timeslice is 200 msecs. Timeslices get refilled after
  67  * they expire.
  68  */
  69 #define MIN_TIMESLICE           ( 10 * HZ / 1000)
  70 #define MAX_TIMESLICE           (200 * HZ / 1000)
  71 #define CHILD_PENALTY           50
  72 #define PARENT_PENALTY          100
  73 #define EXIT_WEIGHT             3
  74 #define PRIO_BONUS_RATIO        25
  75 #define INTERACTIVE_DELTA       2
  76 #define MAX_SLEEP_AVG           (10*HZ)
  77 #define STARVATION_LIMIT        (10*HZ)
  78 #define NODE_THRESHOLD          125
  79
  80 /*
  81  * If a task is 'interactive' then we reinsert it in the active
  82  * array after it has expired its current timeslice. (it will not
  83  * continue to run immediately, it will still roundrobin with
  84  * other interactive tasks.)
  85  *
  86  * This part scales the interactivity limit depending on niceness.
  87  *
  88  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
  89  * Here are a few examples of different nice levels:
  90  *
  91  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
  92  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
  93  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
  94  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
  95  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
  96  *
  97  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
  98  *  priority range a task can explore, a value of '1' means the
  99  *  task is rated interactive.)
 100  *
 101  * Ie. nice +19 tasks can never get 'interactive' enough to be
 102  * reinserted into the active array. And only heavily CPU-hog nice -20
 103  * tasks will be expired. Default nice 0 tasks are somewhere between,
 104  * it takes some effort for them to get interactive, but it's not
 105  * too hard.
 106  */
 107
 108 #define SCALE(v1,v1_max,v2_max) \
 109         (v1) * (v2_max) / (v1_max)
 110
 111 #define DELTA(p) \
 112         (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
 113                 INTERACTIVE_DELTA)
 114
 115 #define TASK_INTERACTIVE(p) \
 116         ((p)->prio <= (p)->static_prio - DELTA(p))
 117
 118 /*
 119  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
 120  * to time slice values.
 121  *
 122  * The higher a thread's priority, the bigger timeslices
 123  * it gets during one round of execution. But even the lowest
 124  * priority thread gets MIN_TIMESLICE worth of execution time.
 125  *
 126  * task_timeslice() is the interface that is used by the scheduler.
 127  */
 128
 129 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
 130         ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
 131
 132 static inline unsigned int task_timeslice(task_t *p)
 133 {
 134         return BASE_TIMESLICE(p);
 135 }
 136
 137 /*
 138  * These are the runqueue data structures:
 139  */
 140
 141 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
 142
 143 typedef struct runqueue runqueue_t;
 144
 145 struct prio_array {
 146         int nr_active;
 147         unsigned long bitmap[BITMAP_SIZE];
 148         struct list_head queue[MAX_PRIO];
 149 };
 150
 151 /*
 152  * This is the main, per-CPU runqueue data structure.
 153  *
 154  * Locking rule: those places that want to lock multiple runqueues
 155  * (such as the load balancing or the thread migration code), lock
 156  * acquire operations must be ordered by ascending &runqueue.
 157  */
 158 struct runqueue {
 159         spinlock_t lock;
 160         unsigned long nr_running, nr_switches, expired_timestamp,
 161                         nr_uninterruptible;
 162         task_t *curr, *idle;
 163         struct mm_struct *prev_mm;
 164         prio_array_t *active, *expired, arrays[2];
 165         int prev_cpu_load[NR_CPUS];
 166 #ifdef CONFIG_NUMA
 167         atomic_t *node_nr_running;
 168         int prev_node_load[MAX_NUMNODES];
 169 #endif
 170         task_t *migration_thread;
 171         struct list_head migration_queue;
 172
 173         atomic_t nr_iowait;
 174 };
 175
 176 static DEFINE_PER_CPU(struct runqueue, runqueues);
 177
 178 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 179 #define this_rq()               (&__get_cpu_var(runqueues))
 180 #define task_rq(p)              cpu_rq(task_cpu(p))
 181 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 182 #define rt_task(p)              ((p)->prio < MAX_RT_PRIO)
 183
 184 /*
 185  * Default context-switch locking:
 186  */
 187 #ifndef prepare_arch_switch
 188 # define prepare_arch_switch(rq, next)  do { } while(0)
 189 # define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
 190 # define task_running(rq, p)            ((rq)->curr == (p))
 191 #endif
 192
 193 #ifdef CONFIG_NUMA
 194
 195 /*
 196  * Keep track of running tasks.
 197  */
 198
 199 static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
 200         {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
 201
 202 static inline void nr_running_init(struct runqueue *rq)
 203 {
 204         rq->node_nr_running = &node_nr_running[0];
 205 }
 206
 207 static inline void nr_running_inc(runqueue_t *rq)
 208 {
 209         atomic_inc(rq->node_nr_running);
 210         rq->nr_running++;
 211 }
 212
 213 static inline void nr_running_dec(runqueue_t *rq)
 214 {
 215         atomic_dec(rq->node_nr_running);
 216         rq->nr_running--;
 217 }
 218
 219 __init void node_nr_running_init(void)
 220 {
 221         int i;
 222
 223         for (i = 0; i < NR_CPUS; i++) {
 224                 if (cpu_possible(i))
 225                         cpu_rq(i)->node_nr_running =
 226                                 &node_nr_running[cpu_to_node(i)];
 227         }
 228 }
 229
 230 #else /* !CONFIG_NUMA */
 231
 232 # define nr_running_init(rq)   do { } while (0)
 233 # define nr_running_inc(rq)    do { (rq)->nr_running++; } while (0)
 234 # define nr_running_dec(rq)    do { (rq)->nr_running--; } while (0)
 235
 236 #endif /* CONFIG_NUMA */
 237
 238 /*
 239  * task_rq_lock - lock the runqueue a given task resides on and disable
 240  * interrupts.  Note the ordering: we can safely lookup the task_rq without
 241  * explicitly disabling preemption.
 242  */
 243 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 244 {
 245         struct runqueue *rq;
 246
 247 repeat_lock_task:
 248         local_irq_save(*flags);
 249         rq = task_rq(p);
 250         spin_lock(&rq->lock);
 251         if (unlikely(rq != task_rq(p))) {
 252                 spin_unlock_irqrestore(&rq->lock, *flags);
 253                 goto repeat_lock_task;
 254         }
 255         return rq;
 256 }
 257
 258 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 259 {
 260         spin_unlock_irqrestore(&rq->lock, *flags);
 261 }
 262
 263 /*
 264  * rq_lock - lock a given runqueue and disable interrupts.
 265  */
 266 static inline runqueue_t *this_rq_lock(void)
 267 {
 268         runqueue_t *rq;
 269
 270         local_irq_disable();
 271         rq = this_rq();
 272         spin_lock(&rq->lock);
 273
 274         return rq;
 275 }
 276
 277 static inline void rq_unlock(runqueue_t *rq)
 278 {
 279         spin_unlock_irq(&rq->lock);
 280 }
 281
 282 /*
 283  * Adding/removing a task to/from a priority array:
 284  */
 285 static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
 286 {
 287         array->nr_active--;
 288         list_del(&p->run_list);
 289         if (list_empty(array->queue + p->prio))
 290                 __clear_bit(p->prio, array->bitmap);
 291 }
 292
 293 static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
 294 {
 295         list_add_tail(&p->run_list, array->queue + p->prio);
 296         __set_bit(p->prio, array->bitmap);
 297         array->nr_active++;
 298         p->array = array;
 299 }
 300
 301 /*
 302  * effective_prio - return the priority that is based on the static
 303  * priority but is modified by bonuses/penalties.
 304  *
 305  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
 306  * into the -5 ... 0 ... +5 bonus/penalty range.
 307  *
 308  * We use 25% of the full 0...39 priority range so that:
 309  *
 310  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
 311  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
 312  *
 313  * Both properties are important to certain workloads.
 314  */
 315 static int effective_prio(task_t *p)
 316 {
 317         int bonus, prio;
 318
 319         if (rt_task(p))
 320                 return p->prio;
 321
 322         bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
 323                         MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
 324
 325         prio = p->static_prio - bonus;
 326         if (prio < MAX_RT_PRIO)
 327                 prio = MAX_RT_PRIO;
 328         if (prio > MAX_PRIO-1)
 329                 prio = MAX_PRIO-1;
 330         return prio;
 331 }
 332
 333 /*
 334  * __activate_task - move a task to the runqueue.
 335  */
 336 static inline void __activate_task(task_t *p, runqueue_t *rq)
 337 {
 338         enqueue_task(p, rq->active);
 339         nr_running_inc(rq);
 340 }
 341
 342 /*
 343  * activate_task - move a task to the runqueue and do priority recalculation
 344  *
 345  * Update all the scheduling statistics stuff. (sleep average
 346  * calculation, priority modifiers, etc.)
 347  */
 348 static inline void activate_task(task_t *p, runqueue_t *rq)
 349 {
 350         long sleep_time = jiffies - p->last_run - 1;
 351
 352         if (sleep_time > 0) {
 353                 int sleep_avg;
 354
 355                 /*
 356                  * This code gives a bonus to interactive tasks.
 357                  *
 358                  * The boost works by updating the 'average sleep time'
 359                  * value here, based on ->last_run. The more time a task
 360                  * spends sleeping, the higher the average gets - and the
 361                  * higher the priority boost gets as well.
 362                  */
 363                 sleep_avg = p->sleep_avg + sleep_time;
 364
 365                 /*
 366                  * 'Overflow' bonus ticks go to the waker as well, so the
 367                  * ticks are not lost. This has the effect of further
 368                  * boosting tasks that are related to maximum-interactive
 369                  * tasks.
 370                  */
 371                 if (sleep_avg > MAX_SLEEP_AVG)
 372                         sleep_avg = MAX_SLEEP_AVG;
 373                 if (p->sleep_avg != sleep_avg) {
 374                         p->sleep_avg = sleep_avg;
 375                         p->prio = effective_prio(p);
 376                 }
 377         }
 378         __activate_task(p, rq);
 379 }
 380
 381 /*
 382  * deactivate_task - remove a task from the runqueue.
 383  */
 384 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 385 {
 386         nr_running_dec(rq);
 387         if (p->state == TASK_UNINTERRUPTIBLE)
 388                 rq->nr_uninterruptible++;
 389         dequeue_task(p, p->array);
 390         p->array = NULL;
 391 }
 392
 393 /*
 394  * resched_task - mark a task 'to be rescheduled now'.
 395  *
 396  * On UP this means the setting of the need_resched flag, on SMP it
 397  * might also involve a cross-CPU call to trigger the scheduler on
 398  * the target CPU.
 399  */
 400 static inline void resched_task(task_t *p)
 401 {
 402 #ifdef CONFIG_SMP
 403         int need_resched, nrpolling;
 404
 405         preempt_disable();
 406         /* minimise the chance of sending an interrupt to poll_idle() */
 407         nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 408         need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
 409         nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 410
 411         if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
 412                 smp_send_reschedule(task_cpu(p));
 413         preempt_enable();
 414 #else
 415         set_tsk_need_resched(p);
 416 #endif
 417 }
 418
 419 #ifdef CONFIG_SMP
 420
 421 /*
 422  * wait_task_inactive - wait for a thread to unschedule.
 423  *
 424  * The caller must ensure that the task *will* unschedule sometime soon,
 425  * else this function might spin for a *long* time. This function can't
 426  * be called with interrupts off, or it may introduce deadlock with
 427  * smp_call_function() if an IPI is sent by the same process we are
 428  * waiting to become inactive.
 429  */
 430 void wait_task_inactive(task_t * p)
 431 {
 432         unsigned long flags;
 433         runqueue_t *rq;
 434
 435 repeat:
 436         preempt_disable();
 437         rq = task_rq(p);
 438         if (unlikely(task_running(rq, p))) {
 439                 cpu_relax();
 440                 /*
 441                  * enable/disable preemption just to make this
 442                  * a preemption point - we are busy-waiting
 443                  * anyway.
 444                  */
 445                 preempt_enable();
 446                 goto repeat;
 447         }
 448         rq = task_rq_lock(p, &flags);
 449         if (unlikely(task_running(rq, p))) {
 450                 task_rq_unlock(rq, &flags);
 451                 preempt_enable();
 452                 goto repeat;
 453         }
 454         task_rq_unlock(rq, &flags);
 455         preempt_enable();
 456 }
 457 #endif
 458
 459 /***
 460  * try_to_wake_up - wake up a thread
 461  * @p: the to-be-woken-up thread
 462  * @state: the mask of task states that can be woken
 463  * @sync: do a synchronous wakeup?
 464  * @kick: kick the CPU if the task is already running?
 465  *
 466  * Put it on the run-queue if it's not already there. The "current"
 467  * thread is always on the run-queue (except when the actual
 468  * re-schedule is in progress), and as such you're allowed to do
 469  * the simpler "current->state = TASK_RUNNING" to mark yourself
 470  * runnable without the overhead of this.
 471  *
 472  * returns failure only if the task is already active.
 473  */
 474 static int try_to_wake_up(task_t * p, unsigned int state, int sync, int kick)
 475 {
 476         unsigned long flags;
 477         int success = 0;
 478         long old_state;
 479         runqueue_t *rq;
 480
 481 repeat_lock_task:
 482         rq = task_rq_lock(p, &flags);
 483         old_state = p->state;
 484         if (old_state & state) {
 485                 if (!p->array) {
 486                         /*
 487                          * Fast-migrate the task if it's not running or runnable
 488                          * currently. Do not violate hard affinity.
 489                          */
 490                         if (unlikely(sync && !task_running(rq, p) &&
 491                                 (task_cpu(p) != smp_processor_id()) &&
 492                                 (p->cpus_allowed & (1UL << smp_processor_id())))) {
 493
 494                                 set_task_cpu(p, smp_processor_id());
 495                                 task_rq_unlock(rq, &flags);
 496                                 goto repeat_lock_task;
 497                         }
 498                         if (old_state == TASK_UNINTERRUPTIBLE)
 499                                 rq->nr_uninterruptible--;
 500                         if (sync)
 501                                 __activate_task(p, rq);
 502                         else {
 503                                 activate_task(p, rq);
 504                                 if (p->prio < rq->curr->prio)
 505                                         resched_task(rq->curr);
 506                         }
 507                         success = 1;
 508                 }
 509 #ifdef CONFIG_SMP
 510                 else
 511                         if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
 512                                 smp_send_reschedule(task_cpu(p));
 513 #endif
 514                 p->state = TASK_RUNNING;
 515         }
 516         task_rq_unlock(rq, &flags);
 517
 518         return success;
 519 }
 520
 521 int wake_up_process(task_t * p)
 522 {
 523         return try_to_wake_up(p, TASK_STOPPED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 0);
 524 }
 525
 526 int wake_up_process_kick(task_t * p)
 527 {
 528         return try_to_wake_up(p, TASK_STOPPED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 1);
 529 }
 530
 531 int wake_up_state(task_t *p, unsigned int state)
 532 {
 533         return try_to_wake_up(p, state, 0, 0);
 534 }
 535
 536 /*
 537  * wake_up_forked_process - wake up a freshly forked process.
 538  *
 539  * This function will do some initial scheduler statistics housekeeping
 540  * that must be done for every newly created process.
 541  */
 542 void wake_up_forked_process(task_t * p)
 543 {
 544         unsigned long flags;
 545         runqueue_t *rq = task_rq_lock(current, &flags);
 546
 547         p->state = TASK_RUNNING;
 548         /*
 549          * We decrease the sleep average of forking parents
 550          * and children as well, to keep max-interactive tasks
 551          * from forking tasks that are max-interactive.
 552          */
 553         current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
 554         p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
 555         p->prio = effective_prio(p);
 556         set_task_cpu(p, smp_processor_id());
 557
 558         if (unlikely(!current->array))
 559                 __activate_task(p, rq);
 560         else {
 561                 p->prio = current->prio;
 562                 list_add_tail(&p->run_list, &current->run_list);
 563                 p->array = current->array;
 564                 p->array->nr_active++;
 565                 nr_running_inc(rq);
 566         }
 567         task_rq_unlock(rq, &flags);
 568 }
 569
 570 /*
 571  * Potentially available exiting-child timeslices are
 572  * retrieved here - this way the parent does not get
 573  * penalized for creating too many threads.
 574  *
 575  * (this cannot be used to 'generate' timeslices
 576  * artificially, because any timeslice recovered here
 577  * was given away by the parent in the first place.)
 578  */
 579 void sched_exit(task_t * p)
 580 {
 581         unsigned long flags;
 582
 583         local_irq_save(flags);
 584         if (p->first_time_slice) {
 585                 p->parent->time_slice += p->time_slice;
 586                 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
 587                         p->parent->time_slice = MAX_TIMESLICE;
 588         }
 589         local_irq_restore(flags);
 590         /*
 591          * If the child was a (relative-) CPU hog then decrease
 592          * the sleep_avg of the parent as well.
 593          */
 594         if (p->sleep_avg < p->parent->sleep_avg)
 595                 p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
 596                         p->sleep_avg) / (EXIT_WEIGHT + 1);
 597 }
 598
 599 /**
 600  * finish_task_switch - clean up after a task-switch
 601  * @prev: the thread we just switched away from.
 602  *
 603  * We enter this with the runqueue still locked, and finish_arch_switch()
 604  * will unlock it along with doing any other architecture-specific cleanup
 605  * actions.
 606  *
 607  * Note that we may have delayed dropping an mm in context_switch(). If
 608  * so, we finish that here outside of the runqueue lock.  (Doing it
 609  * with the lock held can cause deadlocks; see schedule() for
 610  * details.)
 611  */
 612 static inline void finish_task_switch(task_t *prev)
 613 {
 614         runqueue_t *rq = this_rq();
 615         struct mm_struct *mm = rq->prev_mm;
 616
 617         rq->prev_mm = NULL;
 618         finish_arch_switch(rq, prev);
 619         if (mm)
 620                 mmdrop(mm);
 621         if (prev->state & (TASK_DEAD | TASK_ZOMBIE))
 622                 put_task_struct(prev);
 623 }
 624
 625 /**
 626  * schedule_tail - first thing a freshly forked thread must call.
 627  * @prev: the thread we just switched away from.
 628  */
 629 asmlinkage void schedule_tail(task_t *prev)
 630 {
 631         finish_task_switch(prev);
 632
 633         if (current->set_child_tid)
 634                 put_user(current->pid, current->set_child_tid);
 635 }
 636
 637 /*
 638  * context_switch - switch to the new MM and the new
 639  * thread's register state.
 640  */
 641 static inline task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
 642 {
 643         struct mm_struct *mm = next->mm;
 644         struct mm_struct *oldmm = prev->active_mm;
 645
 646         if (unlikely(!mm)) {
 647                 next->active_mm = oldmm;
 648                 atomic_inc(&oldmm->mm_count);
 649                 enter_lazy_tlb(oldmm, next);
 650         } else
 651                 switch_mm(oldmm, mm, next);
 652
 653         if (unlikely(!prev->mm)) {
 654                 prev->active_mm = NULL;
 655                 WARN_ON(rq->prev_mm);
 656                 rq->prev_mm = oldmm;
 657         }
 658
 659         /* Here we just switch the register state and the stack. */
 660         switch_to(prev, next, prev);
 661
 662         return prev;
 663 }
 664
 665 /*
 666  * nr_running, nr_uninterruptible and nr_context_switches:
 667  *
 668  * externally visible scheduler statistics: current number of runnable
 669  * threads, current number of uninterruptible-sleeping threads, total
 670  * number of context switches performed since bootup.
 671  */
 672 unsigned long nr_running(void)
 673 {
 674         unsigned long i, sum = 0;
 675
 676         for (i = 0; i < NR_CPUS; i++)
 677                 sum += cpu_rq(i)->nr_running;
 678
 679         return sum;
 680 }
 681
 682 unsigned long nr_uninterruptible(void)
 683 {
 684         unsigned long i, sum = 0;
 685
 686         for (i = 0; i < NR_CPUS; i++) {
 687                 if (!cpu_online(i))
 688                         continue;
 689                 sum += cpu_rq(i)->nr_uninterruptible;
 690         }
 691         return sum;
 692 }
 693
 694 unsigned long nr_context_switches(void)
 695 {
 696         unsigned long i, sum = 0;
 697
 698         for (i = 0; i < NR_CPUS; i++) {
 699                 if (!cpu_online(i))
 700                         continue;
 701                 sum += cpu_rq(i)->nr_switches;
 702         }
 703         return sum;
 704 }
 705
 706 unsigned long nr_iowait(void)
 707 {
 708         unsigned long i, sum = 0;
 709
 710         for (i = 0; i < NR_CPUS; ++i) {
 711                 if (!cpu_online(i))
 712                         continue;
 713                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
 714         }
 715         return sum;
 716 }
 717
 718 /*
 719  * double_rq_lock - safely lock two runqueues
 720  *
 721  * Note this does not disable interrupts like task_rq_lock,
 722  * you need to do so manually before calling.
 723  */
 724 static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
 725 {
 726         if (rq1 == rq2)
 727                 spin_lock(&rq1->lock);
 728         else {
 729                 if (rq1 < rq2) {
 730                         spin_lock(&rq1->lock);
 731                         spin_lock(&rq2->lock);
 732                 } else {
 733                         spin_lock(&rq2->lock);
 734                         spin_lock(&rq1->lock);
 735                 }
 736         }
 737 }
 738
 739 /*
 740  * double_rq_unlock - safely unlock two runqueues
 741  *
 742  * Note this does not restore interrupts like task_rq_unlock,
 743  * you need to do so manually after calling.
 744  */
 745 static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
 746 {
 747         spin_unlock(&rq1->lock);
 748         if (rq1 != rq2)
 749                 spin_unlock(&rq2->lock);
 750 }
 751
 752 #ifdef CONFIG_NUMA
 753 /*
 754  * If dest_cpu is allowed for this process, migrate the task to it.
 755  * This is accomplished by forcing the cpu_allowed mask to only
 756  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
 757  * the cpu_allowed mask is restored.
 758  */
 759 static void sched_migrate_task(task_t *p, int dest_cpu)
 760 {
 761         unsigned long old_mask;
 762
 763         old_mask = p->cpus_allowed;
 764         if (!(old_mask & (1UL << dest_cpu)))
 765                 return;
 766         /* force the process onto the specified CPU */
 767         set_cpus_allowed(p, 1UL << dest_cpu);
 768
 769         /* restore the cpus allowed mask */
 770         set_cpus_allowed(p, old_mask);
 771 }
 772
 773 /*
 774  * Find the least loaded CPU.  Slightly favor the current CPU by
 775  * setting its runqueue length as the minimum to start.
 776  */
 777 static int sched_best_cpu(struct task_struct *p)
 778 {
 779         int i, minload, load, best_cpu, node = 0;
 780         unsigned long cpumask;
 781
 782         best_cpu = task_cpu(p);
 783         if (cpu_rq(best_cpu)->nr_running <= 2)
 784                 return best_cpu;
 785
 786         minload = 10000000;
 787         for_each_node_with_cpus(i) {
 788                 /*
 789                  * Node load is always divided by nr_cpus_node to normalise
 790                  * load values in case cpu count differs from node to node.
 791                  * We first multiply node_nr_running by 10 to get a little
 792                  * better resolution.
 793                  */
 794                 load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i);
 795                 if (load < minload) {
 796                         minload = load;
 797                         node = i;
 798                 }
 799         }
 800
 801         minload = 10000000;
 802         cpumask = node_to_cpumask(node);
 803         for (i = 0; i < NR_CPUS; ++i) {
 804                 if (!(cpumask & (1UL << i)))
 805                         continue;
 806                 if (cpu_rq(i)->nr_running < minload) {
 807                         best_cpu = i;
 808                         minload = cpu_rq(i)->nr_running;
 809                 }
 810         }
 811         return best_cpu;
 812 }
 813
 814 void sched_balance_exec(void)
 815 {
 816         int new_cpu;
 817
 818         if (numnodes > 1) {
 819                 new_cpu = sched_best_cpu(current);
 820                 if (new_cpu != smp_processor_id())
 821                         sched_migrate_task(current, new_cpu);
 822         }
 823 }
 824
 825 /*
 826  * Find the busiest node. All previous node loads contribute with a
 827  * geometrically deccaying weight to the load measure:
 828  *      load_{t} = load_{t-1}/2 + nr_node_running_{t}
 829  * This way sudden load peaks are flattened out a bit.
 830  * Node load is divided by nr_cpus_node() in order to compare nodes
 831  * of different cpu count but also [first] multiplied by 10 to
 832  * provide better resolution.
 833  */
 834 static int find_busiest_node(int this_node)
 835 {
 836         int i, node = -1, load, this_load, maxload;
 837
 838         if (!nr_cpus_node(this_node))
 839                 return node;
 840         this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
 841                 + (10 * atomic_read(&node_nr_running[this_node])
 842                 / nr_cpus_node(this_node));
 843         this_rq()->prev_node_load[this_node] = this_load;
 844         for_each_node_with_cpus(i) {
 845                 if (i == this_node)
 846                         continue;
 847                 load = (this_rq()->prev_node_load[i] >> 1)
 848                         + (10 * atomic_read(&node_nr_running[i])
 849                         / nr_cpus_node(i));
 850                 this_rq()->prev_node_load[i] = load;
 851                 if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
 852                         maxload = load;
 853                         node = i;
 854                 }
 855         }
 856         return node;
 857 }
 858
 859 #endif /* CONFIG_NUMA */
 860
 861 #ifdef CONFIG_SMP
 862
 863 /*
 864  * double_lock_balance - lock the busiest runqueue
 865  *
 866  * this_rq is locked already. Recalculate nr_running if we have to
 867  * drop the runqueue lock.
 868  */
 869 static inline unsigned int double_lock_balance(runqueue_t *this_rq,
 870         runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
 871 {
 872         if (unlikely(!spin_trylock(&busiest->lock))) {
 873                 if (busiest < this_rq) {
 874                         spin_unlock(&this_rq->lock);
 875                         spin_lock(&busiest->lock);
 876                         spin_lock(&this_rq->lock);
 877                         /* Need to recalculate nr_running */
 878                         if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
 879                                 nr_running = this_rq->nr_running;
 880                         else
 881                                 nr_running = this_rq->prev_cpu_load[this_cpu];
 882                 } else
 883                         spin_lock(&busiest->lock);
 884         }
 885         return nr_running;
 886 }
 887
 888 /*
 889  * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
 890  */
 891 static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance, unsigned long cpumask)
 892 {
 893         int nr_running, load, max_load, i;
 894         runqueue_t *busiest, *rq_src;
 895
 896         /*
 897          * We search all runqueues to find the most busy one.
 898          * We do this lockless to reduce cache-bouncing overhead,
 899          * we re-check the 'best' source CPU later on again, with
 900          * the lock held.
 901          *
 902          * We fend off statistical fluctuations in runqueue lengths by
 903          * saving the runqueue length during the previous load-balancing
 904          * operation and using the smaller one the current and saved lengths.
 905          * If a runqueue is long enough for a longer amount of time then
 906          * we recognize it and pull tasks from it.
 907          *
 908          * The 'current runqueue length' is a statistical maximum variable,
 909          * for that one we take the longer one - to avoid fluctuations in
 910          * the other direction. So for a load-balance to happen it needs
 911          * stable long runqueue on the target CPU and stable short runqueue
 912          * on the local runqueue.
 913          *
 914          * We make an exception if this CPU is about to become idle - in
 915          * that case we are less picky about moving a task across CPUs and
 916          * take what can be taken.
 917          */
 918         if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
 919                 nr_running = this_rq->nr_running;
 920         else
 921                 nr_running = this_rq->prev_cpu_load[this_cpu];
 922
 923         busiest = NULL;
 924         max_load = 1;
 925         for (i = 0; i < NR_CPUS; i++) {
 926                 if (!((1UL << i) & cpumask))
 927                         continue;
 928
 929                 rq_src = cpu_rq(i);
 930                 if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i]))
 931                         load = rq_src->nr_running;
 932                 else
 933                         load = this_rq->prev_cpu_load[i];
 934                 this_rq->prev_cpu_load[i] = rq_src->nr_running;
 935
 936                 if ((load > max_load) && (rq_src != this_rq)) {
 937                         busiest = rq_src;
 938                         max_load = load;
 939                 }
 940         }
 941
 942         if (likely(!busiest))
 943                 goto out;
 944
 945         *imbalance = (max_load - nr_running) / 2;
 946
 947         /* It needs an at least ~25% imbalance to trigger balancing. */
 948         if (!idle && (*imbalance < (max_load + 3)/4)) {
 949                 busiest = NULL;
 950                 goto out;
 951         }
 952
 953         nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
 954         /*
 955          * Make sure nothing changed since we checked the
 956          * runqueue length.
 957          */
 958         if (busiest->nr_running <= nr_running + 1) {
 959                 spin_unlock(&busiest->lock);
 960                 busiest = NULL;
 961         }
 962 out:
 963         return busiest;
 964 }
 965
 966 /*
 967  * pull_task - move a task from a remote runqueue to the local runqueue.
 968  * Both runqueues must be locked.
 969  */
 970 static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
 971 {
 972         dequeue_task(p, src_array);
 973         nr_running_dec(src_rq);
 974         set_task_cpu(p, this_cpu);
 975         nr_running_inc(this_rq);
 976         enqueue_task(p, this_rq->active);
 977         /*
 978          * Note that idle threads have a prio of MAX_PRIO, for this test
 979          * to be always true for them.
 980          */
 981         if (p->prio < this_rq->curr->prio)
 982                 set_need_resched();
 983         else {
 984                 if (p->prio == this_rq->curr->prio &&
 985                                 p->time_slice > this_rq->curr->time_slice)
 986                         set_need_resched();
 987         }
 988 }
 989
 990 /*
 991  * Current runqueue is empty, or rebalance tick: if there is an
 992  * inbalance (current runqueue is too short) then pull from
 993  * busiest runqueue(s).
 994  *
 995  * We call this with the current runqueue locked,
 996  * irqs disabled.
 997  */
 998 static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask)
 999 {
1000         int imbalance, idx, this_cpu = smp_processor_id();
1001         runqueue_t *busiest;
1002         prio_array_t *array;
1003         struct list_head *head, *curr;
1004         task_t *tmp;
1005
1006         busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
1007         if (!busiest)
1008                 goto out;
1009
1010         /*
1011          * We first consider expired tasks. Those will likely not be
1012          * executed in the near future, and they are most likely to
1013          * be cache-cold, thus switching CPUs has the least effect
1014          * on them.
1015          */
1016         if (busiest->expired->nr_active)
1017                 array = busiest->expired;
1018         else
1019                 array = busiest->active;
1020
1021 new_array:
1022         /* Start searching at priority 0: */
1023         idx = 0;
1024 skip_bitmap:
1025         if (!idx)
1026                 idx = sched_find_first_bit(array->bitmap);
1027         else
1028                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1029         if (idx >= MAX_PRIO) {
1030                 if (array == busiest->expired) {
1031                         array = busiest->active;
1032                         goto new_array;
1033                 }
1034                 goto out_unlock;
1035         }
1036
1037         head = array->queue + idx;
1038         curr = head->prev;
1039 skip_queue:
1040         tmp = list_entry(curr, task_t, run_list);
1041
1042         /*
1043          * We do not migrate tasks that are:
1044          * 1) running (obviously), or
1045          * 2) cannot be migrated to this CPU due to cpus_allowed, or
1046          * 3) are cache-hot on their current CPU.
1047          */
1048
1049 #define CAN_MIGRATE_TASK(p,rq,this_cpu)                                 \
1050         ((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) &&    \
1051                 !task_running(rq, p) &&                                 \
1052                         ((p)->cpus_allowed & (1UL << (this_cpu))))
1053
1054         curr = curr->prev;
1055
1056         if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
1057                 if (curr != head)
1058                         goto skip_queue;
1059                 idx++;
1060                 goto skip_bitmap;
1061         }
1062         pull_task(busiest, array, tmp, this_rq, this_cpu);
1063         if (!idle && --imbalance) {
1064                 if (curr != head)
1065                         goto skip_queue;
1066                 idx++;
1067                 goto skip_bitmap;
1068         }
1069 out_unlock:
1070         spin_unlock(&busiest->lock);
1071 out:
1072         ;
1073 }
1074
1075 /*
1076  * One of the idle_cpu_tick() and busy_cpu_tick() functions will
1077  * get called every timer tick, on every CPU. Our balancing action
1078  * frequency and balancing agressivity depends on whether the CPU is
1079  * idle or not.
1080  *
1081  * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
1082  * systems with HZ=100, every 10 msecs.)
1083  *
1084  * On NUMA, do a node-rebalance every 400 msecs.
1085  */
1086 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1087 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
1088 #define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
1089 #define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
1090
1091 #ifdef CONFIG_NUMA
1092 static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
1093 {
1094         int node = find_busiest_node(cpu_to_node(this_cpu));
1095         unsigned long cpumask, this_cpumask = 1UL << this_cpu;
1096
1097         if (node >= 0) {
1098                 cpumask = node_to_cpumask(node) | this_cpumask;
1099                 spin_lock(&this_rq->lock);
1100                 load_balance(this_rq, idle, cpumask);
1101                 spin_unlock(&this_rq->lock);
1102         }
1103 }
1104 #endif
1105
1106 static void rebalance_tick(runqueue_t *this_rq, int idle)
1107 {
1108 #ifdef CONFIG_NUMA
1109         int this_cpu = smp_processor_id();
1110 #endif
1111         unsigned long j = jiffies;
1112
1113         /*
1114          * First do inter-node rebalancing, then intra-node rebalancing,
1115          * if both events happen in the same tick. The inter-node
1116          * rebalancing does not necessarily have to create a perfect
1117          * balance within the node, since we load-balance the most loaded
1118          * node with the current CPU. (ie. other CPUs in the local node
1119          * are not balanced.)
1120          */
1121         if (idle) {
1122 #ifdef CONFIG_NUMA
1123                 if (!(j % IDLE_NODE_REBALANCE_TICK))
1124                         balance_node(this_rq, idle, this_cpu);
1125 #endif
1126                 if (!(j % IDLE_REBALANCE_TICK)) {
1127                         spin_lock(&this_rq->lock);
1128                         load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1129                         spin_unlock(&this_rq->lock);
1130                 }
1131                 return;
1132         }
1133 #ifdef CONFIG_NUMA
1134         if (!(j % BUSY_NODE_REBALANCE_TICK))
1135                 balance_node(this_rq, idle, this_cpu);
1136 #endif
1137         if (!(j % BUSY_REBALANCE_TICK)) {
1138                 spin_lock(&this_rq->lock);
1139                 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1140                 spin_unlock(&this_rq->lock);
1141         }
1142 }
1143 #else
1144 /*
1145  * on UP we do not need to balance between CPUs:
1146  */
1147 static inline void rebalance_tick(runqueue_t *this_rq, int idle)
1148 {
1149 }
1150 #endif
1151
1152 DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
1153
1154 EXPORT_PER_CPU_SYMBOL(kstat);
1155
1156 /*
1157  * We place interactive tasks back into the active array, if possible.
1158  *
1159  * To guarantee that this does not starve expired tasks we ignore the
1160  * interactivity of a task if the first expired task had to wait more
1161  * than a 'reasonable' amount of time. This deadline timeout is
1162  * load-dependent, as the frequency of array switched decreases with
1163  * increasing number of running tasks:
1164  */
1165 #define EXPIRED_STARVING(rq) \
1166                 (STARVATION_LIMIT && ((rq)->expired_timestamp && \
1167                 (jiffies - (rq)->expired_timestamp >= \
1168                         STARVATION_LIMIT * ((rq)->nr_running) + 1)))
1169
1170 /*
1171  * This function gets called by the timer code, with HZ frequency.
1172  * We call it with interrupts disabled.
1173  *
1174  * It also gets called by the fork code, when changing the parent's
1175  * timeslices.
1176  */
1177 void scheduler_tick(int user_ticks, int sys_ticks)
1178 {
1179         int cpu = smp_processor_id();
1180         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1181         runqueue_t *rq = this_rq();
1182         task_t *p = current;
1183
1184         if (rcu_pending(cpu))
1185                 rcu_check_callbacks(cpu, user_ticks);
1186
1187         if (p == rq->idle) {
1188                 /* note: this timer irq context must be accounted for as well */
1189                 if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
1190                         cpustat->system += sys_ticks;
1191                 else if (atomic_read(&rq->nr_iowait) > 0)
1192                         cpustat->iowait += sys_ticks;
1193                 else
1194                         cpustat->idle += sys_ticks;
1195                 rebalance_tick(rq, 1);
1196                 return;
1197         }
1198         if (TASK_NICE(p) > 0)
1199                 cpustat->nice += user_ticks;
1200         else
1201                 cpustat->user += user_ticks;
1202         cpustat->system += sys_ticks;
1203
1204         /* Task might have expired already, but not scheduled off yet */
1205         if (p->array != rq->active) {
1206                 set_tsk_need_resched(p);
1207                 goto out;
1208         }
1209         spin_lock(&rq->lock);
1210         /*
1211          * The task was running during this tick - update the
1212          * time slice counter and the sleep average. Note: we
1213          * do not update a thread's priority until it either
1214          * goes to sleep or uses up its timeslice. This makes
1215          * it possible for interactive tasks to use up their
1216          * timeslices at their highest priority levels.
1217          */
1218         if (p->sleep_avg)
1219                 p->sleep_avg--;
1220         if (unlikely(rt_task(p))) {
1221                 /*
1222                  * RR tasks need a special form of timeslice management.
1223                  * FIFO tasks have no timeslices.
1224                  */
1225                 if ((p->policy == SCHED_RR) && !--p->time_slice) {
1226                         p->time_slice = task_timeslice(p);
1227                         p->first_time_slice = 0;
1228                         set_tsk_need_resched(p);
1229
1230                         /* put it at the end of the queue: */
1231                         dequeue_task(p, rq->active);
1232                         enqueue_task(p, rq->active);
1233                 }
1234                 goto out_unlock;
1235         }
1236         if (!--p->time_slice) {
1237                 dequeue_task(p, rq->active);
1238                 set_tsk_need_resched(p);
1239                 p->prio = effective_prio(p);
1240                 p->time_slice = task_timeslice(p);
1241                 p->first_time_slice = 0;
1242
1243                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1244                         if (!rq->expired_timestamp)
1245                                 rq->expired_timestamp = jiffies;
1246                         enqueue_task(p, rq->expired);
1247                 } else
1248                         enqueue_task(p, rq->active);
1249         }
1250 out_unlock:
1251         spin_unlock(&rq->lock);
1252 out:
1253         rebalance_tick(rq, 0);
1254 }
1255
1256 void scheduling_functions_start_here(void) { }
1257
1258 /*
1259  * schedule() is the main scheduler function.
1260  */
1261 asmlinkage void schedule(void)
1262 {
1263         task_t *prev, *next;
1264         runqueue_t *rq;
1265         prio_array_t *array;
1266         struct list_head *queue;
1267         int idx;
1268
1269         /*
1270          * Test if we are atomic.  Since do_exit() needs to call into
1271          * schedule() atomically, we ignore that path for now.
1272          * Otherwise, whine if we are scheduling when we should not be.
1273          */
1274         if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
1275                 if (unlikely(in_atomic())) {
1276                         printk(KERN_ERR "bad: scheduling while atomic!\n");
1277                         dump_stack();
1278                 }
1279         }
1280
1281 need_resched:
1282         preempt_disable();
1283         prev = current;
1284         rq = this_rq();
1285
1286         release_kernel_lock(prev);
1287         prev->last_run = jiffies;
1288         spin_lock_irq(&rq->lock);
1289
1290         /*
1291          * if entering off of a kernel preemption go straight
1292          * to picking the next task.
1293          */
1294         if (unlikely(preempt_count() & PREEMPT_ACTIVE))
1295                 goto pick_next_task;
1296
1297         switch (prev->state) {
1298         case TASK_INTERRUPTIBLE:
1299                 if (unlikely(signal_pending(prev))) {
1300                         prev->state = TASK_RUNNING;
1301                         break;
1302                 }
1303         default:
1304                 deactivate_task(prev, rq);
1305         case TASK_RUNNING:
1306                 ;
1307         }
1308 pick_next_task:
1309         if (unlikely(!rq->nr_running)) {
1310 #ifdef CONFIG_SMP
1311                 load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
1312                 if (rq->nr_running)
1313                         goto pick_next_task;
1314 #endif
1315                 next = rq->idle;
1316                 rq->expired_timestamp = 0;
1317                 goto switch_tasks;
1318         }
1319
1320         array = rq->active;
1321         if (unlikely(!array->nr_active)) {
1322                 /*
1323                  * Switch the active and expired arrays.
1324                  */
1325                 rq->active = rq->expired;
1326                 rq->expired = array;
1327                 array = rq->active;
1328                 rq->expired_timestamp = 0;
1329         }
1330
1331         idx = sched_find_first_bit(array->bitmap);
1332         queue = array->queue + idx;
1333         next = list_entry(queue->next, task_t, run_list);
1334
1335 switch_tasks:
1336         prefetch(next);
1337         clear_tsk_need_resched(prev);
1338         RCU_qsctr(task_cpu(prev))++;
1339
1340         if (likely(prev != next)) {
1341                 rq->nr_switches++;
1342                 rq->curr = next;
1343
1344                 prepare_arch_switch(rq, next);
1345                 prev = context_switch(rq, prev, next);
1346                 barrier();
1347
1348                 finish_task_switch(prev);
1349         } else
1350                 spin_unlock_irq(&rq->lock);
1351
1352         reacquire_kernel_lock(current);
1353         preempt_enable_no_resched();
1354         if (test_thread_flag(TIF_NEED_RESCHED))
1355                 goto need_resched;
1356 }
1357
1358 #ifdef CONFIG_PREEMPT
1359 /*
1360  * this is is the entry point to schedule() from in-kernel preemption
1361  * off of preempt_enable.  Kernel preemptions off return from interrupt
1362  * occur there and call schedule directly.
1363  */
1364 asmlinkage void preempt_schedule(void)
1365 {
1366         struct thread_info *ti = current_thread_info();
1367
1368         /*
1369          * If there is a non-zero preempt_count or interrupts are disabled,
1370          * we do not want to preempt the current task.  Just return..
1371          */
1372         if (unlikely(ti->preempt_count || irqs_disabled()))
1373                 return;
1374
1375 need_resched:
1376         ti->preempt_count = PREEMPT_ACTIVE;
1377         schedule();
1378         ti->preempt_count = 0;
1379
1380         /* we could miss a preemption opportunity between schedule and now */
1381         barrier();
1382         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
1383                 goto need_resched;
1384 }
1385 #endif /* CONFIG_PREEMPT */
1386
1387 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync)
1388 {
1389         task_t *p = curr->task;
1390         return try_to_wake_up(p, mode, sync, 0);
1391 }
1392
1393 /*
1394  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
1395  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
1396  * number) then we wake all the non-exclusive tasks and one exclusive task.
1397  *
1398  * There are circumstances in which we can try to wake a task which has already
1399  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
1400  * zero in this (rare) case, and we handle it by continuing to scan the queue.
1401  */
1402 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync)
1403 {
1404         struct list_head *tmp, *next;
1405
1406         list_for_each_safe(tmp, next, &q->task_list) {
1407                 wait_queue_t *curr;
1408                 unsigned flags;
1409                 curr = list_entry(tmp, wait_queue_t, task_list);
1410                 flags = curr->flags;
1411                 if (curr->func(curr, mode, sync) &&
1412                     (flags & WQ_FLAG_EXCLUSIVE) &&
1413                     !--nr_exclusive)
1414                         break;
1415         }
1416 }
1417
1418 /**
1419  * __wake_up - wake up threads blocked on a waitqueue.
1420  * @q: the waitqueue
1421  * @mode: which threads
1422  * @nr_exclusive: how many wake-one or wake-many threads to wake up
1423  */
1424 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1425 {
1426         unsigned long flags;
1427
1428         spin_lock_irqsave(&q->lock, flags);
1429         __wake_up_common(q, mode, nr_exclusive, 0);
1430         spin_unlock_irqrestore(&q->lock, flags);
1431 }
1432
1433 /*
1434  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
1435  */
1436 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
1437 {
1438         __wake_up_common(q, mode, 1, 0);
1439 }
1440
1441 /**
1442  * __wake_up - sync- wake up threads blocked on a waitqueue.
1443  * @q: the waitqueue
1444  * @mode: which threads
1445  * @nr_exclusive: how many wake-one or wake-many threads to wake up
1446  *
1447  * The sync wakeup differs that the waker knows that it will schedule
1448  * away soon, so while the target thread will be woken up, it will not
1449  * be migrated to another CPU - ie. the two threads are 'synchronized'
1450  * with each other. This can prevent needless bouncing between CPUs.
1451  *
1452  * On UP it can prevent extra preemption.
1453  */
1454 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1455 {
1456         unsigned long flags;
1457
1458         if (unlikely(!q))
1459                 return;
1460
1461         spin_lock_irqsave(&q->lock, flags);
1462         if (likely(nr_exclusive))
1463                 __wake_up_common(q, mode, nr_exclusive, 1);
1464         else
1465                 __wake_up_common(q, mode, nr_exclusive, 0);
1466         spin_unlock_irqrestore(&q->lock, flags);
1467 }
1468
1469 void complete(struct completion *x)
1470 {
1471         unsigned long flags;
1472
1473         spin_lock_irqsave(&x->wait.lock, flags);
1474         x->done++;
1475         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0);
1476         spin_unlock_irqrestore(&x->wait.lock, flags);
1477 }
1478
1479 void complete_all(struct completion *x)
1480 {
1481         unsigned long flags;
1482
1483         spin_lock_irqsave(&x->wait.lock, flags);
1484         x->done += UINT_MAX/2;
1485         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0, 0);
1486         spin_unlock_irqrestore(&x->wait.lock, flags);
1487 }
1488
1489 void wait_for_completion(struct completion *x)
1490 {
1491         might_sleep();
1492         spin_lock_irq(&x->wait.lock);
1493         if (!x->done) {
1494                 DECLARE_WAITQUEUE(wait, current);
1495
1496                 wait.flags |= WQ_FLAG_EXCLUSIVE;
1497                 __add_wait_queue_tail(&x->wait, &wait);
1498                 do {
1499                         __set_current_state(TASK_UNINTERRUPTIBLE);
1500                         spin_unlock_irq(&x->wait.lock);
1501                         schedule();
1502                         spin_lock_irq(&x->wait.lock);
1503                 } while (!x->done);
1504                 __remove_wait_queue(&x->wait, &wait);
1505         }
1506         x->done--;
1507         spin_unlock_irq(&x->wait.lock);
1508 }
1509
1510 #define SLEEP_ON_VAR                            \
1511         unsigned long flags;                    \
1512         wait_queue_t wait;                      \
1513         init_waitqueue_entry(&wait, current);
1514
1515 #define SLEEP_ON_HEAD                                   \
1516         spin_lock_irqsave(&q->lock,flags);              \
1517         __add_wait_queue(q, &wait);                     \
1518         spin_unlock(&q->lock);
1519
1520 #define SLEEP_ON_TAIL                                           \
1521         spin_lock_irq(&q->lock);                                \
1522         __remove_wait_queue(q, &wait);                          \
1523         spin_unlock_irqrestore(&q->lock, flags);
1524
1525 void interruptible_sleep_on(wait_queue_head_t *q)
1526 {
1527         SLEEP_ON_VAR
1528
1529         current->state = TASK_INTERRUPTIBLE;
1530
1531         SLEEP_ON_HEAD
1532         schedule();
1533         SLEEP_ON_TAIL
1534 }
1535
1536 long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1537 {
1538         SLEEP_ON_VAR
1539
1540         current->state = TASK_INTERRUPTIBLE;
1541
1542         SLEEP_ON_HEAD
1543         timeout = schedule_timeout(timeout);
1544         SLEEP_ON_TAIL
1545
1546         return timeout;
1547 }
1548
1549 void sleep_on(wait_queue_head_t *q)
1550 {
1551         SLEEP_ON_VAR
1552
1553         current->state = TASK_UNINTERRUPTIBLE;
1554
1555         SLEEP_ON_HEAD
1556         schedule();
1557         SLEEP_ON_TAIL
1558 }
1559
1560 long sleep_on_timeout(wait_queue_head_t *q, long timeout)
1561 {
1562         SLEEP_ON_VAR
1563
1564         current->state = TASK_UNINTERRUPTIBLE;
1565
1566         SLEEP_ON_HEAD
1567         timeout = schedule_timeout(timeout);
1568         SLEEP_ON_TAIL
1569
1570         return timeout;
1571 }
1572
1573 void scheduling_functions_end_here(void) { }
1574
1575 void set_user_nice(task_t *p, long nice)
1576 {
1577         unsigned long flags;
1578         prio_array_t *array;
1579         runqueue_t *rq;
1580
1581         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1582                 return;
1583         /*
1584          * We have to be careful, if called from sys_setpriority(),
1585          * the task might be in the middle of scheduling on another CPU.
1586          */
1587         rq = task_rq_lock(p, &flags);
1588         if (rt_task(p)) {
1589                 p->static_prio = NICE_TO_PRIO(nice);
1590                 goto out_unlock;
1591         }
1592         array = p->array;
1593         if (array)
1594                 dequeue_task(p, array);
1595         p->static_prio = NICE_TO_PRIO(nice);
1596         p->prio = NICE_TO_PRIO(nice);
1597         if (array) {
1598                 enqueue_task(p, array);
1599                 /*
1600                  * If the task is running and lowered its priority,
1601                  * or increased its priority then reschedule its CPU:
1602                  */
1603                 if ((NICE_TO_PRIO(nice) < p->static_prio) ||
1604                                                         task_running(rq, p))
1605                         resched_task(rq->curr);
1606         }
1607 out_unlock:
1608         task_rq_unlock(rq, &flags);
1609 }
1610
1611 #ifndef __alpha__
1612
1613 /*
1614  * sys_nice - change the priority of the current process.
1615  * @increment: priority increment
1616  *
1617  * sys_setpriority is a more generic, but much slower function that
1618  * does similar things.
1619  */
1620 asmlinkage long sys_nice(int increment)
1621 {
1622         int retval;
1623         long nice;
1624
1625         /*
1626          *      Setpriority might change our priority at the same moment.
1627          *      We don't have to worry. Conceptually one call occurs first
1628          *      and we have a single winner.
1629          */
1630         if (increment < 0) {
1631                 if (!capable(CAP_SYS_NICE))
1632                         return -EPERM;
1633                 if (increment < -40)
1634                         increment = -40;
1635         }
1636         if (increment > 40)
1637                 increment = 40;
1638
1639         nice = PRIO_TO_NICE(current->static_prio) + increment;
1640         if (nice < -20)
1641                 nice = -20;
1642         if (nice > 19)
1643                 nice = 19;
1644
1645         retval = security_task_setnice(current, nice);
1646         if (retval)
1647                 return retval;
1648
1649         set_user_nice(current, nice);
1650         return 0;
1651 }
1652
1653 #endif
1654
1655 /**
1656  * task_prio - return the priority value of a given task.
1657  * @p: the task in question.
1658  *
1659  * This is the priority value as seen by users in /proc.
1660  * RT tasks are offset by -200. Normal tasks are centered
1661  * around 0, value goes from -16 to +15.
1662  */
1663 int task_prio(task_t *p)
1664 {
1665         return p->prio - MAX_RT_PRIO;
1666 }
1667
1668 /**
1669  * task_nice - return the nice value of a given task.
1670  * @p: the task in question.
1671  */
1672 int task_nice(task_t *p)
1673 {
1674         return TASK_NICE(p);
1675 }
1676
1677 /**
1678  * task_curr - is this task currently executing on a CPU?
1679  * @p: the task in question.
1680  */
1681 int task_curr(task_t *p)
1682 {
1683         return cpu_curr(task_cpu(p)) == p;
1684 }
1685
1686 /**
1687  * idle_cpu - is a given cpu idle currently?
1688  * @cpu: the processor in question.
1689  */
1690 int idle_cpu(int cpu)
1691 {
1692         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
1693 }
1694
1695 /**
1696  * find_process_by_pid - find a process with a matching PID value.
1697  * @pid: the pid in question.
1698  */
1699 static inline task_t *find_process_by_pid(pid_t pid)
1700 {
1701         return pid ? find_task_by_pid(pid) : current;
1702 }
1703
1704 /*
1705  * setscheduler - change the scheduling policy and/or RT priority of a thread.
1706  */
1707 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
1708 {
1709         struct sched_param lp;
1710         int retval = -EINVAL;
1711         int oldprio;
1712         prio_array_t *array;
1713         unsigned long flags;
1714         runqueue_t *rq;
1715         task_t *p;
1716
1717         if (!param || pid < 0)
1718                 goto out_nounlock;
1719
1720         retval = -EFAULT;
1721         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
1722                 goto out_nounlock;
1723
1724         /*
1725          * We play safe to avoid deadlocks.
1726          */
1727         read_lock_irq(&tasklist_lock);
1728
1729         p = find_process_by_pid(pid);
1730
1731         retval = -ESRCH;
1732         if (!p)
1733                 goto out_unlock_tasklist;
1734
1735         /*
1736          * To be able to change p->policy safely, the apropriate
1737          * runqueue lock must be held.
1738          */
1739         rq = task_rq_lock(p, &flags);
1740
1741         if (policy < 0)
1742                 policy = p->policy;
1743         else {
1744                 retval = -EINVAL;
1745                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
1746                                 policy != SCHED_NORMAL)
1747                         goto out_unlock;
1748         }
1749
1750         /*
1751          * Valid priorities for SCHED_FIFO and SCHED_RR are
1752          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
1753          */
1754         retval = -EINVAL;
1755         if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
1756                 goto out_unlock;
1757         if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
1758                 goto out_unlock;
1759
1760         retval = -EPERM;
1761         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1762             !capable(CAP_SYS_NICE))
1763                 goto out_unlock;
1764         if ((current->euid != p->euid) && (current->euid != p->uid) &&
1765             !capable(CAP_SYS_NICE))
1766                 goto out_unlock;
1767
1768         retval = security_task_setscheduler(p, policy, &lp);
1769         if (retval)
1770                 goto out_unlock;
1771
1772         array = p->array;
1773         if (array)
1774                 deactivate_task(p, task_rq(p));
1775         retval = 0;
1776         p->policy = policy;
1777         p->rt_priority = lp.sched_priority;
1778         oldprio = p->prio;
1779         if (policy != SCHED_NORMAL)
1780                 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
1781         else
1782                 p->prio = p->static_prio;
1783         if (array) {
1784                 __activate_task(p, task_rq(p));
1785                 /*
1786                  * Reschedule if we are currently running on this runqueue and
1787                  * our priority decreased, or if we are not currently running on
1788                  * this runqueue and our priority is higher than the current's
1789                  */
1790                 if (rq->curr == p) {
1791                         if (p->prio > oldprio)
1792                                 resched_task(rq->curr);
1793                 } else if (p->prio < rq->curr->prio)
1794                         resched_task(rq->curr);
1795         }
1796
1797 out_unlock:
1798         task_rq_unlock(rq, &flags);
1799 out_unlock_tasklist:
1800         read_unlock_irq(&tasklist_lock);
1801
1802 out_nounlock:
1803         return retval;
1804 }
1805
1806 /**
1807  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
1808  * @pid: the pid in question.
1809  * @policy: new policy
1810  * @param: structure containing the new RT priority.
1811  */
1812 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1813                                       struct sched_param __user *param)
1814 {
1815         return setscheduler(pid, policy, param);
1816 }
1817
1818 /**
1819  * sys_sched_setparam - set/change the RT priority of a thread
1820  * @pid: the pid in question.
1821  * @param: structure containing the new RT priority.
1822  */
1823 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
1824 {
1825         return setscheduler(pid, -1, param);
1826 }
1827
1828 /**
1829  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
1830  * @pid: the pid in question.
1831  */
1832 asmlinkage long sys_sched_getscheduler(pid_t pid)
1833 {
1834         int retval = -EINVAL;
1835         task_t *p;
1836
1837         if (pid < 0)
1838                 goto out_nounlock;
1839
1840         retval = -ESRCH;
1841         read_lock(&tasklist_lock);
1842         p = find_process_by_pid(pid);
1843         if (p) {
1844                 retval = security_task_getscheduler(p);
1845                 if (!retval)
1846                         retval = p->policy;
1847         }
1848         read_unlock(&tasklist_lock);
1849
1850 out_nounlock:
1851         return retval;
1852 }
1853
1854 /**
1855  * sys_sched_getscheduler - get the RT priority of a thread
1856  * @pid: the pid in question.
1857  * @param: structure containing the RT priority.
1858  */
1859 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
1860 {
1861         struct sched_param lp;
1862         int retval = -EINVAL;
1863         task_t *p;
1864
1865         if (!param || pid < 0)
1866                 goto out_nounlock;
1867
1868         read_lock(&tasklist_lock);
1869         p = find_process_by_pid(pid);
1870         retval = -ESRCH;
1871         if (!p)
1872                 goto out_unlock;
1873
1874         retval = security_task_getscheduler(p);
1875         if (retval)
1876                 goto out_unlock;
1877
1878         lp.sched_priority = p->rt_priority;
1879         read_unlock(&tasklist_lock);
1880
1881         /*
1882          * This one might sleep, we cannot do it with a spinlock held ...
1883          */
1884         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
1885
1886 out_nounlock:
1887         return retval;
1888
1889 out_unlock:
1890         read_unlock(&tasklist_lock);
1891         return retval;
1892 }
1893
1894 /**
1895  * sys_sched_setaffinity - set the cpu affinity of a process
1896  * @pid: pid of the process
1897  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
1898  * @user_mask_ptr: user-space pointer to the new cpu mask
1899  */
1900 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
1901                                       unsigned long __user *user_mask_ptr)
1902 {
1903         unsigned long new_mask;
1904         int retval;
1905         task_t *p;
1906
1907         if (len < sizeof(new_mask))
1908                 return -EINVAL;
1909
1910         if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
1911                 return -EFAULT;
1912
1913         read_lock(&tasklist_lock);
1914
1915         p = find_process_by_pid(pid);
1916         if (!p) {
1917                 read_unlock(&tasklist_lock);
1918                 return -ESRCH;
1919         }
1920
1921         /*
1922          * It is not safe to call set_cpus_allowed with the
1923          * tasklist_lock held.  We will bump the task_struct's
1924          * usage count and then drop tasklist_lock.
1925          */
1926         get_task_struct(p);
1927         read_unlock(&tasklist_lock);
1928
1929         retval = -EPERM;
1930         if ((current->euid != p->euid) && (current->euid != p->uid) &&
1931                         !capable(CAP_SYS_NICE))
1932                 goto out_unlock;
1933
1934         retval = set_cpus_allowed(p, new_mask);
1935
1936 out_unlock:
1937         put_task_struct(p);
1938         return retval;
1939 }
1940
1941 /**
1942  * sys_sched_getaffinity - get the cpu affinity of a process
1943  * @pid: pid of the process
1944  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
1945  * @user_mask_ptr: user-space pointer to hold the current cpu mask
1946  */
1947 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
1948                                       unsigned long __user *user_mask_ptr)
1949 {
1950         unsigned int real_len;
1951         unsigned long mask;
1952         int retval;
1953         task_t *p;
1954
1955         real_len = sizeof(mask);
1956         if (len < real_len)
1957                 return -EINVAL;
1958
1959         read_lock(&tasklist_lock);
1960
1961         retval = -ESRCH;
1962         p = find_process_by_pid(pid);
1963         if (!p)
1964                 goto out_unlock;
1965
1966         retval = 0;
1967         mask = p->cpus_allowed & cpu_online_map;
1968
1969 out_unlock:
1970         read_unlock(&tasklist_lock);
1971         if (retval)
1972                 return retval;
1973         if (copy_to_user(user_mask_ptr, &mask, real_len))
1974                 return -EFAULT;
1975         return real_len;
1976 }
1977
1978 /**
1979  * sys_sched_yield - yield the current processor to other threads.
1980  *
1981  * this function yields the current CPU by moving the calling thread
1982  * to the expired array. If there are no other threads running on this
1983  * CPU then this function will return.
1984  */
1985 asmlinkage long sys_sched_yield(void)
1986 {
1987         runqueue_t *rq = this_rq_lock();
1988         prio_array_t *array = current->array;
1989
1990         /*
1991          * We implement yielding by moving the task into the expired
1992          * queue.
1993          *
1994          * (special rule: RT tasks will just roundrobin in the active
1995          *  array.)
1996          */
1997         if (likely(!rt_task(current))) {
1998                 dequeue_task(current, array);
1999                 enqueue_task(current, rq->expired);
2000         } else {
2001                 list_del(&current->run_list);
2002                 list_add_tail(&current->run_list, array->queue + current->prio);
2003         }
2004         /*
2005          * Since we are going to call schedule() anyway, there's
2006          * no need to preempt:
2007          */
2008         _raw_spin_unlock(&rq->lock);
2009         preempt_enable_no_resched();
2010
2011         schedule();
2012
2013         return 0;
2014 }
2015
2016 void __cond_resched(void)
2017 {
2018         set_current_state(TASK_RUNNING);
2019         schedule();
2020 }
2021
2022 /**
2023  * yield - yield the current processor to other threads.
2024  *
2025  * this is a shortcut for kernel-space yielding - it marks the
2026  * thread runnable and calls sys_sched_yield().
2027  */
2028 void yield(void)
2029 {
2030         set_current_state(TASK_RUNNING);
2031         sys_sched_yield();
2032 }
2033
2034 /*
2035  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
2036  * that process accounting knows that this is a task in IO wait state.
2037  *
2038  * But don't do that if it is a deliberate, throttling IO wait (this task
2039  * has set its backing_dev_info: the queue against which it should throttle)
2040  */
2041 void io_schedule(void)
2042 {
2043         struct runqueue *rq = this_rq();
2044
2045         atomic_inc(&rq->nr_iowait);
2046         schedule();
2047         atomic_dec(&rq->nr_iowait);
2048 }
2049
2050 long io_schedule_timeout(long timeout)
2051 {
2052         struct runqueue *rq = this_rq();
2053         long ret;
2054
2055         atomic_inc(&rq->nr_iowait);
2056         ret = schedule_timeout(timeout);
2057         atomic_dec(&rq->nr_iowait);
2058         return ret;
2059 }
2060
2061 /**
2062  * sys_sched_get_priority_max - return maximum RT priority.
2063  * @policy: scheduling class.
2064  *
2065  * this syscall returns the maximum rt_priority that can be used
2066  * by a given scheduling class.
2067  */
2068 asmlinkage long sys_sched_get_priority_max(int policy)
2069 {
2070         int ret = -EINVAL;
2071
2072         switch (policy) {
2073         case SCHED_FIFO:
2074         case SCHED_RR:
2075                 ret = MAX_USER_RT_PRIO-1;
2076                 break;
2077         case SCHED_NORMAL:
2078                 ret = 0;
2079                 break;
2080         }
2081         return ret;
2082 }
2083
2084 /**
2085  * sys_sched_get_priority_min - return minimum RT priority.
2086  * @policy: scheduling class.
2087  *
2088  * this syscall returns the minimum rt_priority that can be used
2089  * by a given scheduling class.
2090  */
2091 asmlinkage long sys_sched_get_priority_min(int policy)
2092 {
2093         int ret = -EINVAL;
2094
2095         switch (policy) {
2096         case SCHED_FIFO:
2097         case SCHED_RR:
2098                 ret = 1;
2099                 break;
2100         case SCHED_NORMAL:
2101                 ret = 0;
2102         }
2103         return ret;
2104 }
2105
2106 /**
2107  * sys_sched_rr_get_interval - return the default timeslice of a process.
2108  * @pid: pid of the process.
2109  * @interval: userspace pointer to the timeslice value.
2110  *
2111  * this syscall writes the default timeslice value of a given process
2112  * into the user-space timespec buffer. A value of '0' means infinity.
2113  */
2114 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
2115 {
2116         int retval = -EINVAL;
2117         struct timespec t;
2118         task_t *p;
2119
2120         if (pid < 0)
2121                 goto out_nounlock;
2122
2123         retval = -ESRCH;
2124         read_lock(&tasklist_lock);
2125         p = find_process_by_pid(pid);
2126         if (!p)
2127                 goto out_unlock;
2128
2129         retval = security_task_getscheduler(p);
2130         if (retval)
2131                 goto out_unlock;
2132
2133         jiffies_to_timespec(p->policy & SCHED_FIFO ?
2134                                 0 : task_timeslice(p), &t);
2135         read_unlock(&tasklist_lock);
2136         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
2137 out_nounlock:
2138         return retval;
2139 out_unlock:
2140         read_unlock(&tasklist_lock);
2141         return retval;
2142 }
2143
2144 static inline struct task_struct *eldest_child(struct task_struct *p)
2145 {
2146         if (list_empty(&p->children)) return NULL;
2147         return list_entry(p->children.next,struct task_struct,sibling);
2148 }
2149
2150 static inline struct task_struct *older_sibling(struct task_struct *p)
2151 {
2152         if (p->sibling.prev==&p->parent->children) return NULL;
2153         return list_entry(p->sibling.prev,struct task_struct,sibling);
2154 }
2155
2156 static inline struct task_struct *younger_sibling(struct task_struct *p)
2157 {
2158         if (p->sibling.next==&p->parent->children) return NULL;
2159         return list_entry(p->sibling.next,struct task_struct,sibling);
2160 }
2161
2162 static void show_task(task_t * p)
2163 {
2164         unsigned long free = 0;
2165         task_t *relative;
2166         int state;
2167         static const char * stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
2168
2169         printk("%-13.13s ", p->comm);
2170         state = p->state ? __ffs(p->state) + 1 : 0;
2171         if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
2172                 printk(stat_nam[state]);
2173         else
2174                 printk(" ");
2175 #if (BITS_PER_LONG == 32)
2176         if (p == current)
2177                 printk(" current  ");
2178         else
2179                 printk(" %08lX ", thread_saved_pc(p));
2180 #else
2181         if (p == current)
2182                 printk("   current task   ");
2183         else
2184                 printk(" %016lx ", thread_saved_pc(p));
2185 #endif
2186         {
2187                 unsigned long * n = (unsigned long *) (p->thread_info+1);
2188                 while (!*n)
2189                         n++;
2190                 free = (unsigned long) n - (unsigned long)(p+1);
2191         }
2192         printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
2193         if ((relative = eldest_child(p)))
2194                 printk("%5d ", relative->pid);
2195         else
2196                 printk("      ");
2197         if ((relative = younger_sibling(p)))
2198                 printk("%7d", relative->pid);
2199         else
2200                 printk("       ");
2201         if ((relative = older_sibling(p)))
2202                 printk(" %5d", relative->pid);
2203         else
2204                 printk("      ");
2205         if (!p->mm)
2206                 printk(" (L-TLB)\n");
2207         else
2208                 printk(" (NOTLB)\n");
2209
2210         show_stack(p, NULL);
2211 }
2212
2213 void show_state(void)
2214 {
2215         task_t *g, *p;
2216
2217 #if (BITS_PER_LONG == 32)
2218         printk("\n"
2219                "                         free                        sibling\n");
2220         printk("  task             PC    stack   pid father child younger older\n");
2221 #else
2222         printk("\n"
2223                "                                 free                        sibling\n");
2224         printk("  task                 PC        stack   pid father child younger older\n");
2225 #endif
2226         read_lock(&tasklist_lock);
2227         do_each_thread(g, p) {
2228                 /*
2229                  * reset the NMI-timeout, listing all files on a slow
2230                  * console might take alot of time:
2231                  */
2232                 touch_nmi_watchdog();
2233                 show_task(p);
2234         } while_each_thread(g, p);
2235
2236         read_unlock(&tasklist_lock);
2237 }
2238
2239 void __init init_idle(task_t *idle, int cpu)
2240 {
2241         runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
2242         unsigned long flags;
2243
2244         local_irq_save(flags);
2245         double_rq_lock(idle_rq, rq);
2246
2247         idle_rq->curr = idle_rq->idle = idle;
2248         deactivate_task(idle, rq);
2249         idle->array = NULL;
2250         idle->prio = MAX_PRIO;
2251         idle->state = TASK_RUNNING;
2252         set_task_cpu(idle, cpu);
2253         double_rq_unlock(idle_rq, rq);
2254         set_tsk_need_resched(idle);
2255         local_irq_restore(flags);
2256
2257         /* Set the preempt count _outside_ the spinlocks! */
2258 #ifdef CONFIG_PREEMPT
2259         idle->thread_info->preempt_count = (idle->lock_depth >= 0);
2260 #else
2261         idle->thread_info->preempt_count = 0;
2262 #endif
2263 }
2264
2265 #ifdef CONFIG_SMP
2266 /*
2267  * This is how migration works:
2268  *
2269  * 1) we queue a migration_req_t structure in the source CPU's
2270  *    runqueue and wake up that CPU's migration thread.
2271  * 2) we down() the locked semaphore => thread blocks.
2272  * 3) migration thread wakes up (implicitly it forces the migrated
2273  *    thread off the CPU)
2274  * 4) it gets the migration request and checks whether the migrated
2275  *    task is still in the wrong runqueue.
2276  * 5) if it's in the wrong runqueue then the migration thread removes
2277  *    it and puts it into the right queue.
2278  * 6) migration thread up()s the semaphore.
2279  * 7) we wake up and the migration is done.
2280  */
2281
2282 typedef struct {
2283         struct list_head list;
2284         task_t *task;
2285         struct completion done;
2286 } migration_req_t;
2287
2288 /*
2289  * Change a given task's CPU affinity. Migrate the thread to a
2290  * proper CPU and schedule it away if the CPU it's executing on
2291  * is removed from the allowed bitmask.
2292  *
2293  * NOTE: the caller must have a valid reference to the task, the
2294  * task must not exit() & deallocate itself prematurely.  The
2295  * call is not atomic; no spinlocks may be held.
2296  */
2297 int set_cpus_allowed(task_t *p, unsigned long new_mask)
2298 {
2299         unsigned long flags;
2300         migration_req_t req;
2301         runqueue_t *rq;
2302
2303         if (any_online_cpu(new_mask) == NR_CPUS)
2304                 return -EINVAL;
2305
2306         rq = task_rq_lock(p, &flags);
2307         p->cpus_allowed = new_mask;
2308         /*
2309          * Can the task run on the task's current CPU? If not then
2310          * migrate the thread off to a proper CPU.
2311          */
2312         if (new_mask & (1UL << task_cpu(p))) {
2313                 task_rq_unlock(rq, &flags);
2314                 return 0;
2315         }
2316         /*
2317          * If the task is not on a runqueue (and not running), then
2318          * it is sufficient to simply update the task's cpu field.
2319          */
2320         if (!p->array && !task_running(rq, p)) {
2321                 set_task_cpu(p, any_online_cpu(p->cpus_allowed));
2322                 task_rq_unlock(rq, &flags);
2323                 return 0;
2324         }
2325         init_completion(&req.done);
2326         req.task = p;
2327         list_add(&req.list, &rq->migration_queue);
2328         task_rq_unlock(rq, &flags);
2329
2330         wake_up_process(rq->migration_thread);
2331
2332         wait_for_completion(&req.done);
2333         return 0;
2334 }
2335
2336 /* Move (not current) task off this cpu, onto dest cpu. */
2337 static void move_task_away(struct task_struct *p, int dest_cpu)
2338 {
2339         runqueue_t *rq_dest;
2340         unsigned long flags;
2341
2342         rq_dest = cpu_rq(dest_cpu);
2343
2344         local_irq_save(flags);
2345         double_rq_lock(this_rq(), rq_dest);
2346         if (task_cpu(p) != smp_processor_id())
2347                 goto out; /* Already moved */
2348
2349         set_task_cpu(p, dest_cpu);
2350         if (p->array) {
2351                 deactivate_task(p, this_rq());
2352                 activate_task(p, rq_dest);
2353                 if (p->prio < rq_dest->curr->prio)
2354                         resched_task(rq_dest->curr);
2355         }
2356  out:
2357         double_rq_unlock(this_rq(), rq_dest);
2358         local_irq_restore(flags);
2359 }
2360
2361 /*
2362  * migration_thread - this is a highprio system thread that performs
2363  * thread migration by bumping thread off CPU then 'pushing' onto
2364  * another runqueue.
2365  */
2366 static int migration_thread(void * data)
2367 {
2368         /* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */
2369         struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 };
2370         int cpu = (long) data;
2371         runqueue_t *rq;
2372         int ret;
2373
2374         daemonize("migration/%d", cpu);
2375         set_fs(KERNEL_DS);
2376
2377         /*
2378          * Either we are running on the right CPU, or there's a a
2379          * migration thread on this CPU, guaranteed (we're started
2380          * serially).
2381          */
2382         set_cpus_allowed(current, 1UL << cpu);
2383
2384         ret = setscheduler(0, SCHED_FIFO, &param);
2385
2386         rq = this_rq();
2387         rq->migration_thread = current;
2388
2389         for (;;) {
2390                 struct list_head *head;
2391                 migration_req_t *req;
2392
2393                 spin_lock_irq(&rq->lock);
2394                 head = &rq->migration_queue;
2395                 current->state = TASK_INTERRUPTIBLE;
2396                 if (list_empty(head)) {
2397                         spin_unlock_irq(&rq->lock);
2398                         schedule();
2399                         continue;
2400                 }
2401                 req = list_entry(head->next, migration_req_t, list);
2402                 list_del_init(head->next);
2403                 spin_unlock_irq(&rq->lock);
2404
2405                 move_task_away(req->task,
2406                                any_online_cpu(req->task->cpus_allowed));
2407                 complete(&req->done);
2408         }
2409 }
2410
2411 /*
2412  * migration_call - callback that gets triggered when a CPU is added.
2413  * Here we can start up the necessary migration thread for the new CPU.
2414  */
2415 static int migration_call(struct notifier_block *nfb,
2416                           unsigned long action,
2417                           void *hcpu)
2418 {
2419         switch (action) {
2420         case CPU_ONLINE:
2421                 printk("Starting migration thread for cpu %li\n",
2422                        (long)hcpu);
2423                 kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
2424                 while (!cpu_rq((long)hcpu)->migration_thread)
2425                         yield();
2426                 break;
2427         }
2428         return NOTIFY_OK;
2429 }
2430
2431 static struct notifier_block migration_notifier = { &migration_call, NULL, 0 };
2432
2433 __init int migration_init(void)
2434 {
2435         /* Start one for boot CPU. */
2436         migration_call(&migration_notifier, CPU_ONLINE,
2437                        (void *)(long)smp_processor_id());
2438         register_cpu_notifier(&migration_notifier);
2439         return 0;
2440 }
2441
2442 #endif
2443
2444 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2445 /*
2446  * The 'big kernel lock'
2447  *
2448  * This spinlock is taken and released recursively by lock_kernel()
2449  * and unlock_kernel().  It is transparently dropped and reaquired
2450  * over schedule().  It is used to protect legacy code that hasn't
2451  * been migrated to a proper locking design yet.
2452  *
2453  * Don't use in new code.
2454  */
2455 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
2456 #endif
2457
2458 static void kstat_init_cpu(int cpu)
2459 {
2460         /* Add any initialisation to kstat here */
2461         /* Useful when cpu offlining logic is added.. */
2462 }
2463
2464 static int __devinit kstat_cpu_notify(struct notifier_block *self,
2465                                         unsigned long action, void *hcpu)
2466 {
2467         int cpu = (unsigned long)hcpu;
2468         switch(action) {
2469         case CPU_UP_PREPARE:
2470                 kstat_init_cpu(cpu);
2471                 break;
2472         default:
2473                 break;
2474         }
2475         return NOTIFY_OK;
2476 }
2477
2478 static struct notifier_block __devinitdata kstat_nb = {
2479         .notifier_call  = kstat_cpu_notify,
2480         .next           = NULL,
2481 };
2482
2483 __init static void init_kstat(void) {
2484         kstat_cpu_notify(&kstat_nb, (unsigned long)CPU_UP_PREPARE,
2485                         (void *)(long)smp_processor_id());
2486         register_cpu_notifier(&kstat_nb);
2487 }
2488
2489 void __init sched_init(void)
2490 {
2491         runqueue_t *rq;
2492         int i, j, k;
2493
2494         /* Init the kstat counters */
2495         init_kstat();
2496         for (i = 0; i < NR_CPUS; i++) {
2497                 prio_array_t *array;
2498
2499                 rq = cpu_rq(i);
2500                 rq->active = rq->arrays;
2501                 rq->expired = rq->arrays + 1;
2502                 spin_lock_init(&rq->lock);
2503                 INIT_LIST_HEAD(&rq->migration_queue);
2504                 atomic_set(&rq->nr_iowait, 0);
2505                 nr_running_init(rq);
2506
2507                 for (j = 0; j < 2; j++) {
2508                         array = rq->arrays + j;
2509                         for (k = 0; k < MAX_PRIO; k++) {
2510                                 INIT_LIST_HEAD(array->queue + k);
2511                                 __clear_bit(k, array->bitmap);
2512                         }
2513                         // delimiter for bitsearch
2514                         __set_bit(MAX_PRIO, array->bitmap);
2515                 }
2516         }
2517         /*
2518          * We have to do a little magic to get the first
2519          * thread right in SMP mode.
2520          */
2521         rq = this_rq();
2522         rq->curr = current;
2523         rq->idle = current;
2524         set_task_cpu(current, smp_processor_id());
2525         wake_up_forked_process(current);
2526
2527         init_timers();
2528
2529         /*
2530          * The boot idle thread does lazy MMU switching as well:
2531          */
2532         atomic_inc(&init_mm.mm_count);
2533         enter_lazy_tlb(&init_mm, current);
2534 }
2535
2536 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
2537 void __might_sleep(char *file, int line)
2538 {
2539 #if defined(in_atomic)
2540         static unsigned long prev_jiffy;        /* ratelimiting */
2541
2542         if (in_atomic() || irqs_disabled()) {
2543                 if (time_before(jiffies, prev_jiffy + HZ))
2544                         return;
2545                 prev_jiffy = jiffies;
2546                 printk(KERN_ERR "Debug: sleeping function called from invalid"
2547                                 " context at %s:%d\n", file, line);
2548                 dump_stack();
2549         }
2550 #endif
2551 }
2552 #endif
2553
2554
2555 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
2556 /*
2557  * This could be a long-held lock.  If another CPU holds it for a long time,
2558  * and that CPU is not asked to reschedule then *this* CPU will spin on the
2559  * lock for a long time, even if *this* CPU is asked to reschedule.
2560  *
2561  * So what we do here, in the slow (contended) path is to spin on the lock by
2562  * hand while permitting preemption.
2563  *
2564  * Called inside preempt_disable().
2565  */
2566 void __preempt_spin_lock(spinlock_t *lock)
2567 {
2568         if (preempt_count() > 1) {
2569                 _raw_spin_lock(lock);
2570                 return;
2571         }
2572         do {
2573                 preempt_enable();
2574                 while (spin_is_locked(lock))
2575                         cpu_relax();
2576                 preempt_disable();
2577         } while (!_raw_spin_trylock(lock));
2578 }
2579
2580 void __preempt_write_lock(rwlock_t *lock)
2581 {
2582         if (preempt_count() > 1) {
2583                 _raw_write_lock(lock);
2584                 return;
2585         }
2586
2587         do {
2588                 preempt_enable();
2589                 while (rwlock_is_locked(lock))
2590                         cpu_relax();
2591                 preempt_disable();
2592         } while (!_raw_write_trylock(lock));
2593 }
2594 #endif