kernel/sched.c

   1 /*
   2  *  linux/kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991, 1992  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
  13  */
  14
   15 /*
   16  * 'sched.c' is the main kernel file. It contains scheduling primitives
   17  * (sleep_on, wakeup, schedule etc) as well as a number of simple system
   18  * call functions (of the getpid() kind), which just extract a field from
   19  * the current task.
   20  */
  21
  22 #include <linux/config.h>
  23 #include <linux/mm.h>
  24 #include <linux/init.h>
  25 #include <linux/smp_lock.h>
  26 #include <linux/interrupt.h>
  27 #include <linux/kernel_stat.h>
  28
  29 #include <asm/uaccess.h>
  30 #include <asm/mmu_context.h>
   31
      /*
       * Bottom-half handlers implemented outside this file.  Only their
       * prototypes are needed here; none of them is called in this part of
       * the file.
       */
   32 extern void timer_bh(void);
   33 extern void tqueue_bh(void);
   34 extern void immediate_bh(void);
   35
   36 /*
   37  * scheduler variables
   38  */
   39
   40 unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
   41
      /* Prototype only; mem_use() is not referenced in this part of the file. */
   42 extern void mem_use(void);
  43
   44 /*
   45  * Scheduling quanta.
   46  *
   47  * NOTE! The unix "nice" value influences how long a process
   48  * gets. The nice value ranges from -20 to +19, where a -20
   49  * is a "high-priority" task, and a "+10" is a low-priority
   50  * task.
   51  *
   52  * We want the time-slice to be around 50ms or so, so this
   53  * calculation depends on the value of HZ.
       *
       * TICK_SCALE() shifts its argument by an amount selected from HZ.
       * NICE_TO_TICKS() applies it to (20 - nice) and adds one, so the
       * result is never zero.  With HZ < 200 this gives 11 ticks for
       * nice -20, 6 ticks for nice 0 and 1 tick for nice +19.
   54  */
   55 #if HZ < 200
   56 #define TICK_SCALE(x)   ((x) >> 2)
   57 #elif HZ < 400
   58 #define TICK_SCALE(x)   ((x) >> 1)
   59 #elif HZ < 800
   60 #define TICK_SCALE(x)   (x)
   61 #elif HZ < 1600
   62 #define TICK_SCALE(x)   ((x) << 1)
   63 #else
   64 #define TICK_SCALE(x)   ((x) << 2)
   65 #endif
   66
      /* Number of clock ticks a task with the given nice value is granted. */
   67 #define NICE_TO_TICKS(nice)     (TICK_SCALE(20-(nice))+1)
  68
  69
   70 /*
   71  *      Init task must be ok at boot for the ix86 as we will check its signals
   72  *      via the SMP irq return path.
       *
       *      init_tasks[] holds one task pointer per CPU; on SMP idle_task()
       *      looks a CPU's idle task up in it.  Only slot 0 is filled in
       *      statically (with init_task), the other slots start out NULL.
   73  */
   74
   75 struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
  76
   77 /*
   78  * The tasklist_lock protects the linked list of processes.
   79  *
   80  * The runqueue_lock locks the parts that actually access
   81  * and change the run-queues, and have to be interrupt-safe.
   82  *
   83  * If both locks are to be concurrently held, the runqueue_lock
   84  * nests inside the tasklist_lock.
   85  */
   86 spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
   87 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
   88
      /*
       * Head of the run-queue: tasks are linked in through their run_list
       * member.  The users in this part of the file touch it with
       * runqueue_lock held.
       */
   89 static LIST_HEAD(runqueue_head);
  90
   91 /*
   92  * We align per-CPU scheduling data on cacheline boundaries,
   93  * to prevent cacheline ping-pong.
   94  */
   95 static union {
   96         struct schedule_data {
   97                 struct task_struct * curr;      /* task schedule() last chose for this CPU */
   98                 cycles_t last_schedule;         /* get_cycles() stamp stored by schedule() on SMP */
   99         } schedule_data;
  100         char __pad [SMP_CACHE_BYTES];           /* makes each entry at least SMP_CACHE_BYTES big */
  101 } aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
  102
      /* Shorthands for the two fields of one CPU's entry in aligned_data[]. */
  103 #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
  104 #define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
  105
      /* Kernel statistics; schedule() counts context switches in context_swtch. */
  106 struct kernel_stat kstat;
 107
 108 #ifdef CONFIG_SMP
 109
 110 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
 111 #define can_schedule(p,cpu) ((!(p)->has_cpu) && \
 112                                 ((p)->cpus_allowed & (1 << cpu)))
 113
 114 #else
 115
 116 #define idle_task(cpu) (&init_task)
 117 #define can_schedule(p,cpu) (1)
 118
 119 #endif
  120
      /*
       * Empty marker function placed ahead of the scheduling code; its
       * counterpart scheduling_functions_end_here() follows that code.
       * NOTE(review): presumably other code compares addresses against the
       * two markers to recognise scheduler functions - confirm with the users.
       */
  121 void scheduling_functions_start_here(void) { }
 122
 123 /*
 124  * This is the function that decides how desirable a process is..
 125  * You can weigh different processes against each other depending
 126  * on what CPU they've run on lately etc to try to handle cache
 127  * and TLB miss penalties.
 128  *
 129  * Return values:
 130  *       -1000: never select this
 131  *           0: out of time, recalculate counters (but it might still be
 132  *              selected)
 133  *         +ve: "goodness" value (the larger, the better)
 134  *       +1000: realtime process, select this.
 135  */
 136
 137 static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
 138 {
 139         int weight;
 140
 141         /*
 142          * select the current process after every other
 143          * runnable process, but before the idle thread.
 144          * Also, dont trigger a counter recalculation.
 145          */
 146         weight = -1;
 147         if (p->policy & SCHED_YIELD)
 148                 goto out;
 149
 150         /*
 151          * Non-RT process - normal case first.
 152          */
 153         if (p->policy == SCHED_OTHER) {
 154                 /*
 155                  * Give the process a first-approximation goodness value
 156                  * according to the number of clock-ticks it has left.
 157                  *
 158                  * Don't do any other calculations if the time slice is
 159                  * over..
 160                  */
 161                 weight = p->counter;
 162                 if (!weight)
 163                         goto out;
 164
 165 #ifdef CONFIG_SMP
 166                 /* Give a largish advantage to the same processor...   */
 167                 /* (this is equivalent to penalizing other processors) */
 168                 if (p->processor == this_cpu)
 169                         weight += PROC_CHANGE_PENALTY;
 170 #endif
 171
 172                 /* .. and a slight advantage to the current MM */
 173                 if (p->mm == this_mm || !p->mm)
 174                         weight += 1;
 175                 weight += 20 - p->nice;
 176                 goto out;
 177         }
 178
 179         /*
 180          * Realtime process, select the first one on the
 181          * runqueue (taking priorities within processes
 182          * into account).
 183          */
 184         weight = 1000 + p->rt_priority;
 185 out:
 186         return weight;
 187 }
 188
 189 /*
 190  * the 'goodness value' of replacing a process on a given CPU.
 191  * positive value means 'replace', zero or negative means 'dont'.
 192  */
 193 static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
 194 {
 195         return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
 196 }
 197
 198 /*
 199  * This is ugly, but reschedule_idle() is very timing-critical.
 200  * We are called with the runqueue spinlock held and we must
 201  * not claim the tasklist_lock.
 202  */
 203 static FASTCALL(void reschedule_idle(struct task_struct * p));
 204
 205 static void reschedule_idle(struct task_struct * p)
 206 {
 207 #ifdef CONFIG_SMP
 208         int this_cpu = smp_processor_id();
 209         struct task_struct *tsk, *target_tsk;
 210         int cpu, best_cpu, i, max_prio;
 211         cycles_t oldest_idle;
 212
 213         /*
 214          * shortcut if the woken up task's last CPU is
 215          * idle now.
 216          */
 217         best_cpu = p->processor;
 218         if (can_schedule(p, best_cpu)) {
 219                 tsk = idle_task(best_cpu);
 220                 if (cpu_curr(best_cpu) == tsk) {
 221                         int need_resched;
 222 send_now_idle:
 223                         /*
 224                          * If need_resched == -1 then we can skip sending
 225                          * the IPI altogether, tsk->need_resched is
 226                          * actively watched by the idle thread.
 227                          */
 228                         need_resched = tsk->need_resched;
 229                         tsk->need_resched = 1;
 230                         if ((best_cpu != this_cpu) && !need_resched)
 231                                 smp_send_reschedule(best_cpu);
 232                         return;
 233                 }
 234         }
 235
 236         /*
 237          * We know that the preferred CPU has a cache-affine current
 238          * process, lets try to find a new idle CPU for the woken-up
 239          * process. Select the least recently active idle CPU. (that
 240          * one will have the least active cache context.) Also find
 241          * the executing process which has the least priority.
 242          */
 243         oldest_idle = (cycles_t) -1;
 244         target_tsk = NULL;
 245         max_prio = 1;
 246
 247         for (i = 0; i < smp_num_cpus; i++) {
 248                 cpu = cpu_logical_map(i);
 249                 if (!can_schedule(p, cpu))
 250                         continue;
 251                 tsk = cpu_curr(cpu);
 252                 /*
 253                  * We use the first available idle CPU. This creates
 254                  * a priority list between idle CPUs, but this is not
 255                  * a problem.
 256                  */
 257                 if (tsk == idle_task(cpu)) {
 258                         if (last_schedule(cpu) < oldest_idle) {
 259                                 oldest_idle = last_schedule(cpu);
 260                                 target_tsk = tsk;
 261                         }
 262                 } else {
 263                         if (oldest_idle == -1ULL) {
 264                                 int prio = preemption_goodness(tsk, p, cpu);
 265
 266                                 if (prio > max_prio) {
 267                                         max_prio = prio;
 268                                         target_tsk = tsk;
 269                                 }
 270                         }
 271                 }
 272         }
 273         tsk = target_tsk;
 274         if (tsk) {
 275                 if (oldest_idle != -1ULL) {
 276                         best_cpu = tsk->processor;
 277                         goto send_now_idle;
 278                 }
 279                 tsk->need_resched = 1;
 280                 if (tsk->processor != this_cpu)
 281                         smp_send_reschedule(tsk->processor);
 282         }
 283         return;
 284
 285
 286 #else /* UP */
 287         int this_cpu = smp_processor_id();
 288         struct task_struct *tsk;
 289
 290         tsk = cpu_curr(this_cpu);
 291         if (preemption_goodness(tsk, p, this_cpu) > 1)
 292                 tsk->need_resched = 1;
 293 #endif
 294 }
 295
 296 /*
 297  * Careful!
 298  *
 299  * This has to add the process to the _beginning_ of the
 300  * run-queue, not the end. See the comment about "This is
 301  * subtle" in the scheduler proper..
 302  */
 303 static inline void add_to_runqueue(struct task_struct * p)
 304 {
 305         list_add(&p->run_list, &runqueue_head);
 306         nr_running++;
 307 }
 308
 309 static inline void move_last_runqueue(struct task_struct * p)
 310 {
 311         list_del(&p->run_list);
 312         list_add_tail(&p->run_list, &runqueue_head);
 313 }
 314
 315 static inline void move_first_runqueue(struct task_struct * p)
 316 {
 317         list_del(&p->run_list);
 318         list_add(&p->run_list, &runqueue_head);
 319 }
 320
 321 /*
 322  * Wake up a process. Put it on the run-queue if it's not
 323  * already there.  The "current" process is always on the
 324  * run-queue (except when the actual re-schedule is in
 325  * progress), and as such you're allowed to do the simpler
 326  * "current->state = TASK_RUNNING" to mark yourself runnable
 327  * without the overhead of this.
 328  */
 329 inline void wake_up_process(struct task_struct * p)
 330 {
 331         unsigned long flags;
 332
 333         /*
 334          * We want the common case fall through straight, thus the goto.
 335          */
 336         spin_lock_irqsave(&runqueue_lock, flags);
 337         p->state = TASK_RUNNING;
 338         if (task_on_runqueue(p))
 339                 goto out;
 340         add_to_runqueue(p);
 341         reschedule_idle(p);
 342 out:
 343         spin_unlock_irqrestore(&runqueue_lock, flags);
 344 }
 345
 346 static inline void wake_up_process_synchronous(struct task_struct * p)
 347 {
 348         unsigned long flags;
 349
 350         /*
 351          * We want the common case fall through straight, thus the goto.
 352          */
 353         spin_lock_irqsave(&runqueue_lock, flags);
 354         p->state = TASK_RUNNING;
 355         if (task_on_runqueue(p))
 356                 goto out;
 357         add_to_runqueue(p);
 358 out:
 359         spin_unlock_irqrestore(&runqueue_lock, flags);
 360 }
 361
 362 static void process_timeout(unsigned long __data)
 363 {
 364         struct task_struct * p = (struct task_struct *) __data;
 365
 366         wake_up_process(p);
 367 }
 368
 369 signed long schedule_timeout(signed long timeout)
 370 {
 371         struct timer_list timer;
 372         unsigned long expire;
 373
 374         switch (timeout)
 375         {
 376         case MAX_SCHEDULE_TIMEOUT:
 377                 /*
 378                  * These two special cases are useful to be comfortable
 379                  * in the caller. Nothing more. We could take
 380                  * MAX_SCHEDULE_TIMEOUT from one of the negative value
 381                  * but I' d like to return a valid offset (>=0) to allow
 382                  * the caller to do everything it want with the retval.
 383                  */
 384                 schedule();
 385                 goto out;
 386         default:
 387                 /*
 388                  * Another bit of PARANOID. Note that the retval will be
 389                  * 0 since no piece of kernel is supposed to do a check
 390                  * for a negative retval of schedule_timeout() (since it
 391                  * should never happens anyway). You just have the printk()
 392                  * that will tell you if something is gone wrong and where.
 393                  */
 394                 if (timeout < 0)
 395                 {
 396                         printk(KERN_ERR "schedule_timeout: wrong timeout "
 397                                "value %lx from %p\n", timeout,
 398                                __builtin_return_address(0));
 399                         current->state = TASK_RUNNING;
 400                         goto out;
 401                 }
 402         }
 403
 404         expire = timeout + jiffies;
 405
 406         init_timer(&timer);
 407         timer.expires = expire;
 408         timer.data = (unsigned long) current;
 409         timer.function = process_timeout;
 410
 411         add_timer(&timer);
 412         schedule();
 413         del_timer_sync(&timer);
 414
 415         timeout = expire - jiffies;
 416
 417  out:
 418         return timeout < 0 ? 0 : timeout;
 419 }
 420
  421 /*
  422  * schedule_tail() is getting called from the fork return path. This
  423  * cleans up all remaining scheduler things, without impacting the
  424  * common case.
       *
       * __schedule_tail() finishes a switch away from 'prev'.  It clears
       * SCHED_YIELD in prev->policy.  On SMP it also clears prev->has_cpu
       * under task_lock(prev) and, if prev is still TASK_RUNNING, is not
       * this CPU's idle task and was not yielding, calls reschedule_idle()
       * for it under runqueue_lock.
  425  */
  426 static inline void __schedule_tail(struct task_struct *prev)
  427 {
  428 #ifdef CONFIG_SMP
  429         int policy;
  430
  431         /*
  432          * prev->policy can be written from here only before `prev'
  433          * can be scheduled (before setting prev->has_cpu to zero).
  434          * Of course it must also be read before allowing prev
  435          * to be rescheduled, but since the write depends on the read
  436          * to complete, wmb() is enough. (the spin_lock() acquired
  437          * before setting has_cpu is not enough because the spin_lock()
  438          * common code semantics allows code outside the critical section
  439          * to enter inside the critical section)
  440          */
  441         policy = prev->policy;
  442         prev->policy = policy & ~SCHED_YIELD;
  443         wmb();
  444
  445         /*
  446          * fast path falls through. We have to clear has_cpu before
  447          * checking prev->state to avoid a wakeup race - thus we
  448          * also have to protect against the task exiting early.
  449          */
  450         task_lock(prev);
  451         prev->has_cpu = 0;
  452         mb();
  453         if (prev->state == TASK_RUNNING)
  454                 goto needs_resched;
  455
  456 out_unlock:
  457         task_unlock(prev);      /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
  458         return;
  459
  460         /*
  461          * Slow path - we 'push' the previous process and
  462          * reschedule_idle() will attempt to find a new
  463          * processor for it. (but it might preempt the
  464          * current process as well.) We must take the runqueue
  465          * lock and re-check prev->state to be correct. It might
  466          * still happen that this process has a preemption
  467          * 'in progress' already - but this is not a problem and
  468          * might happen in other circumstances as well.
  469          */
  470 needs_resched:
  471         {
  472                 unsigned long flags;
  473
  474                 /*
  475                  * Avoid taking the runqueue lock in cases where
  476                  * no preemption-check is necessary:
                       * 'policy' still holds the value read before
                       * SCHED_YIELD was cleared above.
  477                  */
  478                 if ((prev == idle_task(smp_processor_id())) ||
  479                                                 (policy & SCHED_YIELD))
  480                         goto out_unlock;
  481
  482                 spin_lock_irqsave(&runqueue_lock, flags);
  483                 if (prev->state == TASK_RUNNING)
  484                         reschedule_idle(prev);
  485                 spin_unlock_irqrestore(&runqueue_lock, flags);
  486                 goto out_unlock;
  487         }
  488 #else
              /* UP: only the yield flag has to be dropped. */
  489         prev->policy &= ~SCHED_YIELD;
  490 #endif /* CONFIG_SMP */
  491 }
  492
      /* Out-of-line entry point that simply runs __schedule_tail() on 'prev'. */
  493 void schedule_tail(struct task_struct *prev)
  494 {
  495         __schedule_tail(prev);
  496 }
 497
 498 /*
 499  *  'schedule()' is the scheduler function. It's a very simple and nice
 500  * scheduler: it's not perfect, but certainly works for most things.
 501  *
 502  * The goto is "interesting".
 503  *
 504  *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 505  * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 506  * information in task[0] is never used.
 507  */
 508 asmlinkage void schedule(void)
 509 {
 510         struct schedule_data * sched_data;
 511         struct task_struct *prev, *next, *p;
 512         struct list_head *tmp;
 513         int this_cpu, c;
 514
 515         if (!current->active_mm) BUG();
 516         if (tq_scheduler)
 517                 goto handle_tq_scheduler;
 518 tq_scheduler_back:
 519
 520         prev = current;
 521         this_cpu = prev->processor;
 522
 523         if (in_interrupt())
 524                 goto scheduling_in_interrupt;
 525
 526         release_kernel_lock(prev, this_cpu);
 527
 528         /* Do "administrative" work here while we don't hold any locks */
 529         if (softirq_active(this_cpu) & softirq_mask(this_cpu))
 530                 goto handle_softirq;
 531 handle_softirq_back:
 532
 533         /*
 534          * 'sched_data' is protected by the fact that we can run
 535          * only one process per CPU.
 536          */
 537         sched_data = & aligned_data[this_cpu].schedule_data;
 538
 539         spin_lock_irq(&runqueue_lock);
 540
 541         /* move an exhausted RR process to be last.. */
 542         if (prev->policy == SCHED_RR)
 543                 goto move_rr_last;
 544 move_rr_back:
 545
 546         switch (prev->state) {
 547                 case TASK_INTERRUPTIBLE:
 548                         if (signal_pending(prev)) {
 549                                 prev->state = TASK_RUNNING;
 550                                 break;
 551                         }
 552                 default:
 553                         del_from_runqueue(prev);
 554                 case TASK_RUNNING:
 555         }
 556         prev->need_resched = 0;
 557
 558         /*
 559          * this is the scheduler proper:
 560          */
 561
 562 repeat_schedule:
 563         /*
 564          * Default process to select..
 565          */
 566         next = idle_task(this_cpu);
 567         c = -1000;
 568         if (prev->state == TASK_RUNNING)
 569                 goto still_running;
 570
 571 still_running_back:
 572         list_for_each(tmp, &runqueue_head) {
 573                 p = list_entry(tmp, struct task_struct, run_list);
 574                 if (can_schedule(p, this_cpu)) {
 575                         int weight = goodness(p, this_cpu, prev->active_mm);
 576                         if (weight > c)
 577                                 c = weight, next = p;
 578                 }
 579         }
 580
 581         /* Do we need to re-calculate counters? */
 582         if (!c)
 583                 goto recalculate;
 584         /*
 585          * from this point on nothing can prevent us from
 586          * switching to the next task, save this fact in
 587          * sched_data.
 588          */
 589         sched_data->curr = next;
 590 #ifdef CONFIG_SMP
 591         next->has_cpu = 1;
 592         next->processor = this_cpu;
 593 #endif
 594         spin_unlock_irq(&runqueue_lock);
 595
 596         if (prev == next)
 597                 goto same_process;
 598
 599 #ifdef CONFIG_SMP
 600         /*
 601          * maintain the per-process 'last schedule' value.
 602          * (this has to be recalculated even if we reschedule to
 603          * the same process) Currently this is only used on SMP,
 604          * and it's approximate, so we do not have to maintain
 605          * it while holding the runqueue spinlock.
 606          */
 607         sched_data->last_schedule = get_cycles();
 608
 609         /*
 610          * We drop the scheduler lock early (it's a global spinlock),
 611          * thus we have to lock the previous process from getting
 612          * rescheduled during switch_to().
 613          */
 614
 615 #endif /* CONFIG_SMP */
 616
 617         kstat.context_swtch++;
 618         /*
 619          * there are 3 processes which are affected by a context switch:
 620          *
 621          * prev == .... ==> (last => next)
 622          *
 623          * It's the 'much more previous' 'prev' that is on next's stack,
 624          * but prev is set to (the just run) 'last' process by switch_to().
 625          * This might sound slightly confusing but makes tons of sense.
 626          */
 627         prepare_to_switch();
 628         {
 629                 struct mm_struct *mm = next->mm;
 630                 struct mm_struct *oldmm = prev->active_mm;
 631                 if (!mm) {
 632                         if (next->active_mm) BUG();
 633                         next->active_mm = oldmm;
 634                         atomic_inc(&oldmm->mm_count);
 635                         enter_lazy_tlb(oldmm, next, this_cpu);
 636                 } else {
 637                         if (next->active_mm != mm) BUG();
 638                         switch_mm(oldmm, mm, next, this_cpu);
 639                 }
 640
 641                 if (!prev->mm) {
 642                         prev->active_mm = NULL;
 643                         mmdrop(oldmm);
 644                 }
 645         }
 646
 647         /*
 648          * This just switches the register state and the
 649          * stack.
 650          */
 651         switch_to(prev, next, prev);
 652         __schedule_tail(prev);
 653
 654 same_process:
 655         reacquire_kernel_lock(current);
 656         if (current->need_resched)
 657                 goto tq_scheduler_back;
 658
 659         return;
 660
 661 recalculate:
 662         {
 663                 struct task_struct *p;
 664                 spin_unlock_irq(&runqueue_lock);
 665                 read_lock(&tasklist_lock);
 666                 for_each_task(p)
 667                         p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
 668                 read_unlock(&tasklist_lock);
 669                 spin_lock_irq(&runqueue_lock);
 670         }
 671         goto repeat_schedule;
 672
 673 still_running:
 674         c = goodness(prev, this_cpu, prev->active_mm);
 675         next = prev;
 676         goto still_running_back;
 677
 678 handle_softirq:
 679         do_softirq();
 680         goto handle_softirq_back;
 681
 682 handle_tq_scheduler:
 683         /*
 684          * do not run the task queue with disabled interrupts,
 685          * cli() wouldn't work on SMP
 686          */
 687         sti();
 688         run_task_queue(&tq_scheduler);
 689         goto tq_scheduler_back;
 690
 691 move_rr_last:
 692         if (!prev->counter) {
 693                 prev->counter = NICE_TO_TICKS(prev->nice);
 694                 move_last_runqueue(prev);
 695         }
 696         goto move_rr_back;
 697
 698 scheduling_in_interrupt:
 699         printk("Scheduling in interrupt\n");
 700         BUG();
 701         return;
 702 }
  703
      /*
       * Wake up tasks waiting on 'q'.
       *
       *   q       - wait queue head; NULL is accepted and does nothing
       *   mode    - mask of task states; only waiters whose state matches
       *             are woken
       *   wq_mode - ANDed with each entry's flags and WQ_FLAG_EXCLUSIVE to
       *             decide whether the entry counts as exclusive
       *   sync    - non-zero: use wake_up_process_synchronous(), otherwise
       *             wake_up_process()
       *
       * The list is walked with q->lock write-held.  Matching waiters are
       * woken in list order, and the walk stops after the first exclusive
       * one.  In interrupt context the wake-up of an exclusive waiter is
       * deferred: the first one found is remembered, but a later one whose
       * 'processor' equals the current CPU replaces it and ends the walk.
       * The remembered task is woken after the loop.
       */
  704 static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
  705                                      unsigned int wq_mode, const int sync)
  706 {
  707         struct list_head *tmp, *head;
  708         struct task_struct *p, *best_exclusive;
  709         unsigned long flags;
  710         int best_cpu, irq;
  711
  712         if (!q)
  713                 goto out;
  714
  715         best_cpu = smp_processor_id();
  716         irq = in_interrupt();
  717         best_exclusive = NULL;
  718         wq_write_lock_irqsave(&q->lock, flags);
  719
  720 #if WAITQUEUE_DEBUG
  721         CHECK_MAGIC_WQHEAD(q);
  722 #endif
  723
  724         head = &q->task_list;
  725 #if WAITQUEUE_DEBUG
  726         if (!head->next || !head->prev)
  727                 WQ_BUG();
  728 #endif
  729         tmp = head->next;
  730         while (tmp != head) {
  731                 unsigned int state;
  732                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
  733
                      /* Step to the next entry before this one is dealt with. */
  734                 tmp = tmp->next;
  735
  736 #if WAITQUEUE_DEBUG
  737                 CHECK_MAGIC(curr->__magic);
  738 #endif
  739                 p = curr->task;
  740                 state = p->state;
  741                 if (state & mode) {
  742 #if WAITQUEUE_DEBUG
  743                         curr->__waker = (long)__builtin_return_address(0);
  744 #endif
  745                         /*
  746                          * If waking up from an interrupt context then
  747                          * prefer processes which are affine to this
  748                          * CPU.
  749                          */
  750                         if (irq && (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)) {
  751                                 if (!best_exclusive)
  752                                         best_exclusive = p;
  753                                 if (p->processor == best_cpu) {
  754                                         best_exclusive = p;
  755                                         break;
  756                                 }
  757                         } else {
  758                                 if (sync)
  759                                         wake_up_process_synchronous(p);
  760                                 else
  761                                         wake_up_process(p);
                                      /* One exclusive waiter is enough. */
  762                                 if (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)
  763                                         break;
  764                         }
  765                 }
  766         }
              /* Deferred wake-up of the exclusive waiter chosen above, if any. */
  767         if (best_exclusive) {
  768                 if (sync)
  769                         wake_up_process_synchronous(best_exclusive);
  770                 else
  771                         wake_up_process(best_exclusive);
  772         }
  773         wq_write_unlock_irqrestore(&q->lock, flags);
  774 out:
  775         return;
  776 }
  777
      /* Wake up waiters on 'q' via __wake_up_common() with sync == 0. */
  778 void __wake_up(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
  779 {
  780         __wake_up_common(q, mode, wq_mode, 0);
  781 }
  782
      /*
       * Wake up waiters on 'q' via __wake_up_common() with sync == 1, i.e.
       * through wake_up_process_synchronous().
       */
  783 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
  784 {
  785         __wake_up_common(q, mode, wq_mode, 1);
  786 }
  787
      /*
       * Building blocks of the sleep_on() family below.  They expand inside
       * a function that has a wait_queue_head_t pointer named 'q' in scope.
       *
       * SLEEP_ON_VAR  declares 'flags' and a wait queue entry 'wait' for the
       *               current task.
       * SLEEP_ON_HEAD adds 'wait' to 'q' under q->lock.  The lock is taken
       *               with the irqsave variant but released with the plain
       *               unlock, so 'flags' is not restored here.
       * SLEEP_ON_TAIL removes 'wait' from 'q' under q->lock and releases the
       *               lock with the irqrestore variant, using the 'flags'
       *               saved by SLEEP_ON_HEAD.
       */
  788 #define SLEEP_ON_VAR                            \
  789         unsigned long flags;                    \
  790         wait_queue_t wait;                      \
  791         init_waitqueue_entry(&wait, current);
  792
  793 #define SLEEP_ON_HEAD                                   \
  794         wq_write_lock_irqsave(&q->lock,flags);          \
  795         __add_wait_queue(q, &wait);                     \
  796         wq_write_unlock(&q->lock);
  797
  798 #define SLEEP_ON_TAIL                                           \
  799         wq_write_lock_irq(&q->lock);                            \
  800         __remove_wait_queue(q, &wait);                          \
  801         wq_write_unlock_irqrestore(&q->lock,flags);
  802
      /*
       * Put the current task to sleep on 'q' in state TASK_INTERRUPTIBLE.
       * The state is set first, then the task is queued on 'q', schedule()
       * is called, and once it returns the task is taken off 'q' again.
       */
  803 void interruptible_sleep_on(wait_queue_head_t *q)
  804 {
  805         SLEEP_ON_VAR
  806
  807         current->state = TASK_INTERRUPTIBLE;
  808
  809         SLEEP_ON_HEAD
  810         schedule();
  811         SLEEP_ON_TAIL
  812 }
  813
      /*
       * Like interruptible_sleep_on(), but sleeps through
       * schedule_timeout(timeout) and returns what that call returned.
       */
  814 long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
  815 {
  816         SLEEP_ON_VAR
  817
  818         current->state = TASK_INTERRUPTIBLE;
  819
  820         SLEEP_ON_HEAD
  821         timeout = schedule_timeout(timeout);
  822         SLEEP_ON_TAIL
  823
  824         return timeout;
  825 }
  826
      /*
       * Put the current task to sleep on 'q' in state TASK_UNINTERRUPTIBLE.
       * The state is set first, then the task is queued on 'q', schedule()
       * is called, and once it returns the task is taken off 'q' again.
       */
  827 void sleep_on(wait_queue_head_t *q)
  828 {
  829         SLEEP_ON_VAR
  830
  831         current->state = TASK_UNINTERRUPTIBLE;
  832
  833         SLEEP_ON_HEAD
  834         schedule();
  835         SLEEP_ON_TAIL
  836 }
  837
      /*
       * Like sleep_on(), but sleeps through schedule_timeout(timeout) and
       * returns what that call returned.
       */
  838 long sleep_on_timeout(wait_queue_head_t *q, long timeout)
  839 {
  840         SLEEP_ON_VAR
  841
  842         current->state = TASK_UNINTERRUPTIBLE;
  843
  844         SLEEP_ON_HEAD
  845         timeout = schedule_timeout(timeout);
  846         SLEEP_ON_TAIL
  847
  848         return timeout;
  849 }
  850
      /*
       * Empty marker function placed after the scheduling code; its
       * counterpart scheduling_functions_start_here() precedes that code.
       */
  851 void scheduling_functions_end_here(void) { }
 852
 853 #ifndef __alpha__
 854
 855 /*
 856  * This has been replaced by sys_setpriority.  Maybe it should be
 857  * moved into the arch dependent tree for those ports that require
 858  * it for backward compatibility?
 859  */
 860
 861 asmlinkage long sys_nice(int increment)
 862 {
 863         long newprio;
 864
 865         /*
 866          *      Setpriority might change our priority at the same moment.
 867          *      We don't have to worry. Conceptually one call occurs first
 868          *      and we have a single winner.
 869          */
 870         if (increment < 0) {
 871                 if (!capable(CAP_SYS_NICE))
 872                         return -EPERM;
 873                 if (increment < -40)
 874                         increment = -40;
 875         }
 876         if (increment > 40)
 877                 increment = 40;
 878
 879         newprio = current->nice + increment;
 880         if (newprio < -20)
 881                 newprio = -20;
 882         if (newprio > 19)
 883                 newprio = 19;
 884         current->nice = newprio;
 885         return 0;
 886 }
 887
 888 #endif
 889
 890 static inline struct task_struct *find_process_by_pid(pid_t pid)
 891 {
 892         struct task_struct *tsk = current;
 893
 894         if (pid)
 895                 tsk = find_task_by_pid(pid);
 896         return tsk;
 897 }
 898
 899 static int setscheduler(pid_t pid, int policy,
 900                         struct sched_param *param)
 901 {
 902         struct sched_param lp;
 903         struct task_struct *p;
 904         int retval;
 905
 906         retval = -EINVAL;
 907         if (!param || pid < 0)
 908                 goto out_nounlock;
 909
 910         retval = -EFAULT;
 911         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
 912                 goto out_nounlock;
 913
 914         /*
 915          * We play safe to avoid deadlocks.
 916          */
 917         read_lock_irq(&tasklist_lock);
 918         spin_lock(&runqueue_lock);
 919
 920         p = find_process_by_pid(pid);
 921
 922         retval = -ESRCH;
 923         if (!p)
 924                 goto out_unlock;
 925
 926         if (policy < 0)
 927                 policy = p->policy;
 928         else {
 929                 retval = -EINVAL;
 930                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
 931                                 policy != SCHED_OTHER)
 932                         goto out_unlock;
 933         }
 934
 935         /*
 936          * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
 937          * priority for SCHED_OTHER is 0.
 938          */
 939         retval = -EINVAL;
 940         if (lp.sched_priority < 0 || lp.sched_priority > 99)
 941                 goto out_unlock;
 942         if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
 943                 goto out_unlock;
 944
 945         retval = -EPERM;
 946         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
 947             !capable(CAP_SYS_NICE))
 948                 goto out_unlock;
 949         if ((current->euid != p->euid) && (current->euid != p->uid) &&
 950             !capable(CAP_SYS_NICE))
 951                 goto out_unlock;
 952
 953         retval = 0;
 954         p->policy = policy;
 955         p->rt_priority = lp.sched_priority;
 956         if (task_on_runqueue(p))
 957                 move_first_runqueue(p);
 958
 959         current->need_resched = 1;
 960
 961 out_unlock:
 962         spin_unlock(&runqueue_lock);
 963         read_unlock_irq(&tasklist_lock);
 964
 965 out_nounlock:
 966         return retval;
 967 }
 968
 969 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
 970                                       struct sched_param *param)
 971 {
 972         return setscheduler(pid, policy, param);
 973 }
 974
 975 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param)
 976 {
 977         return setscheduler(pid, -1, param);
 978 }
 979
 980 asmlinkage long sys_sched_getscheduler(pid_t pid)
 981 {
 982         struct task_struct *p;
 983         int retval;
 984
 985         retval = -EINVAL;
 986         if (pid < 0)
 987                 goto out_nounlock;
 988
 989         retval = -ESRCH;
 990         read_lock(&tasklist_lock);
 991         p = find_process_by_pid(pid);
 992         if (p)
 993                 retval = p->policy & ~SCHED_YIELD;
 994         read_unlock(&tasklist_lock);
 995
 996 out_nounlock:
 997         return retval;
 998 }
 999
1000 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
1001 {
1002         struct task_struct *p;
1003         struct sched_param lp;
1004         int retval;
1005
1006         retval = -EINVAL;
1007         if (!param || pid < 0)
1008                 goto out_nounlock;
1009
1010         read_lock(&tasklist_lock);
1011         p = find_process_by_pid(pid);
1012         retval = -ESRCH;
1013         if (!p)
1014                 goto out_unlock;
1015         lp.sched_priority = p->rt_priority;
1016         read_unlock(&tasklist_lock);
1017
1018         /*
1019          * This one might sleep, we cannot do it with a spinlock held ...
1020          */
1021         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
1022
1023 out_nounlock:
1024         return retval;
1025
1026 out_unlock:
1027         read_unlock(&tasklist_lock);
1028         return retval;
1029 }
1030
1031 asmlinkage long sys_sched_yield(void)
1032 {
1033         /*
1034          * Trick. sched_yield() first counts the number of truly
1035          * 'pending' runnable processes, then returns if it's
1036          * only the current processes. (This test does not have
1037          * to be atomic.) In threaded applications this optimization
1038          * gets triggered quite often.
1039          */
1040
1041         int nr_pending = nr_running;
1042
1043 #if CONFIG_SMP
1044         int i;
1045
1046         // Substract non-idle processes running on other CPUs.
1047         for (i = 0; i < smp_num_cpus; i++)
1048                 if (aligned_data[i].schedule_data.curr != idle_task(i))
1049                         nr_pending--;
1050 #else
1051         // on UP this process is on the runqueue as well
1052         nr_pending--;
1053 #endif
1054         if (nr_pending) {
1055                 /*
1056                  * This process can only be rescheduled by us,
1057                  * so this is safe without any locking.
1058                  */
1059                 if (current->policy == SCHED_OTHER)
1060                         current->policy |= SCHED_YIELD;
1061                 current->need_resched = 1;
1062         }
1063         return 0;
1064 }
1065
1066 asmlinkage long sys_sched_get_priority_max(int policy)
1067 {
1068         int ret = -EINVAL;
1069
1070         switch (policy) {
1071         case SCHED_FIFO:
1072         case SCHED_RR:
1073                 ret = 99;
1074                 break;
1075         case SCHED_OTHER:
1076                 ret = 0;
1077                 break;
1078         }
1079         return ret;
1080 }
1081
1082 asmlinkage long sys_sched_get_priority_min(int policy)
1083 {
1084         int ret = -EINVAL;
1085
1086         switch (policy) {
1087         case SCHED_FIFO:
1088         case SCHED_RR:
1089                 ret = 1;
1090                 break;
1091         case SCHED_OTHER:
1092                 ret = 0;
1093         }
1094         return ret;
1095 }
1096
1097 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
1098 {
1099         struct timespec t;
1100         struct task_struct *p;
1101         int retval = -EINVAL;
1102
1103         if (pid < 0)
1104                 goto out_nounlock;
1105
1106         retval = -ESRCH;
1107         read_lock(&tasklist_lock);
1108         p = find_process_by_pid(pid);
1109         if (p)
1110                 jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
1111                                     &t);
1112         read_unlock(&tasklist_lock);
1113         if (p)
1114                 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1115 out_nounlock:
1116         return retval;
1117 }
1118
1119 static void show_task(struct task_struct * p)
1120 {
1121         unsigned long free = 0;
1122         int state;
1123         static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
1124
1125         printk("%-8s  ", p->comm);
1126         state = p->state ? ffz(~p->state) + 1 : 0;
1127         if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
1128                 printk(stat_nam[state]);
1129         else
1130                 printk(" ");
1131 #if (BITS_PER_LONG == 32)
1132         if (p == current)
1133                 printk(" current  ");
1134         else
1135                 printk(" %08lX ", thread_saved_pc(&p->thread));
1136 #else
1137         if (p == current)
1138                 printk("   current task   ");
1139         else
1140                 printk(" %016lx ", thread_saved_pc(&p->thread));
1141 #endif
1142         {
1143                 unsigned long * n = (unsigned long *) (p+1);
1144                 while (!*n)
1145                         n++;
1146                 free = (unsigned long) n - (unsigned long)(p+1);
1147         }
1148         printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
1149         if (p->p_cptr)
1150                 printk("%5d ", p->p_cptr->pid);
1151         else
1152                 printk("      ");
1153         if (!p->mm)
1154                 printk(" (L-TLB) ");
1155         else
1156                 printk(" (NOTLB) ");
1157         if (p->p_ysptr)
1158                 printk("%7d", p->p_ysptr->pid);
1159         else
1160                 printk("       ");
1161         if (p->p_osptr)
1162                 printk(" %5d\n", p->p_osptr->pid);
1163         else
1164                 printk("\n");
1165
1166         {
1167                 struct sigqueue *q;
1168                 char s[sizeof(sigset_t)*2+1], b[sizeof(sigset_t)*2+1];
1169
1170                 render_sigset_t(&p->pending.signal, s);
1171                 render_sigset_t(&p->blocked, b);
1172                 printk("   sig: %d %s %s :", signal_pending(p), s, b);
1173                 for (q = p->pending.head; q ; q = q->next)
1174                         printk(" %d", q->info.si_signo);
1175                 printk(" X\n");
1176         }
1177 }
1178
1179 char * render_sigset_t(sigset_t *set, char *buffer)
1180 {
1181         int i = _NSIG, x;
1182         do {
1183                 i -= 4, x = 0;
1184                 if (sigismember(set, i+1)) x |= 1;
1185                 if (sigismember(set, i+2)) x |= 2;
1186                 if (sigismember(set, i+3)) x |= 4;
1187                 if (sigismember(set, i+4)) x |= 8;
1188                 *buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
1189         } while (i >= 4);
1190         *buffer = 0;
1191         return buffer;
1192 }
1193
1194 void show_state(void)
1195 {
1196         struct task_struct *p;
1197
1198 #if (BITS_PER_LONG == 32)
1199         printk("\n"
1200                "                         free                        sibling\n");
1201         printk("  task             PC    stack   pid father child younger older\n");
1202 #else
1203         printk("\n"
1204                "                                 free                        sibling\n");
1205         printk("  task                 PC        stack   pid father child younger older\n");
1206 #endif
1207         read_lock(&tasklist_lock);
1208         for_each_task(p)
1209                 show_task(p);
1210         read_unlock(&tasklist_lock);
1211 }
1212
1213 /*
1214  *      Put all the gunge required to become a kernel thread without
1215  *      attached user resources in one place where it belongs.
1216  */
1217
1218 void daemonize(void)
1219 {
1220         struct fs_struct *fs;
1221
1222
1223         /*
1224          * If we were started as result of loading a module, close all of the
1225          * user space pages.  We don't need them, and if we didn't close them
1226          * they would be locked into memory.
1227          */
1228         exit_mm(current);
1229
1230         current->session = 1;
1231         current->pgrp = 1;
1232
1233         /* Become as one with the init task */
1234
1235         exit_fs(current);       /* current->fs->count--; */
1236         fs = init_task.fs;
1237         current->fs = fs;
1238         atomic_inc(&fs->count);
1239         exit_files(current);
1240         current->files = init_task.files;
1241         atomic_inc(&current->files->count);
1242 }
1243
1244 void __init init_idle(void)
1245 {
1246         struct schedule_data * sched_data;
1247         sched_data = &aligned_data[smp_processor_id()].schedule_data;
1248
1249         if (current != &init_task && task_on_runqueue(current)) {
1250                 printk("UGH! (%d:%d) was on the runqueue, removing.\n",
1251                         smp_processor_id(), current->pid);
1252                 del_from_runqueue(current);
1253         }
1254         sched_data->curr = current;
1255         sched_data->last_schedule = get_cycles();
1256 }
1257
1258 extern void init_timervecs (void);
1259
1260 void __init sched_init(void)
1261 {
1262         /*
1263          * We have to do a little magic to get the first
1264          * process right in SMP mode.
1265          */
1266         int cpu = smp_processor_id();
1267         int nr;
1268
1269         init_task.processor = cpu;
1270
1271         for(nr = 0; nr < PIDHASH_SZ; nr++)
1272                 pidhash[nr] = NULL;
1273
1274         init_timervecs();
1275
1276         init_bh(TIMER_BH, timer_bh);
1277         init_bh(TQUEUE_BH, tqueue_bh);
1278         init_bh(IMMEDIATE_BH, immediate_bh);
1279
1280         /*
1281          * The boot idle thread does lazy MMU switching as well:
1282          */
1283         atomic_inc(&init_mm.mm_count);
1284         enter_lazy_tlb(&init_mm, current, cpu);
1285 }