kernel/sched.c

   1 /*
   2  *  linux/kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991, 1992  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
  13  */
  14
  15 /*
  16  * 'sched.c' is the main kernel file. It contains scheduling primitives
  17  * (sleep_on, wakeup, schedule etc) as well as a number of simple system
  18  * call functions (of the getpid() kind), which just extract a field
  19  * from the current task.
  20  */
  21
  22 #include <linux/config.h>
  23 #include <linux/mm.h>
  24 #include <linux/init.h>
  25 #include <linux/smp_lock.h>
  26 #include <linux/interrupt.h>
  27 #include <linux/kernel_stat.h>
  28
  29 #include <asm/uaccess.h>
  30 #include <asm/mmu_context.h>
  31
  32 extern void timer_bh(void);
  33 extern void tqueue_bh(void);
  34 extern void immediate_bh(void);
  35
  36 /*
  37  * scheduler variables
  38  */
  39
  40 unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
  41
  42 extern void mem_use(void);
  43
  44 /*
  45  * Scheduling quanta.
  46  *
  47  * NOTE! The unix "nice" value influences how long a process
  48  * gets. The nice value ranges from -20 to +19, where a -20
  49  * is a "high-priority" task, and a "+10" is a low-priority
  50  * task.
  51  *
  52  * We want the time-slice to be around 50ms or so, so this
  53  * calculation depends on the value of HZ.
  54  */
  55 #if HZ < 200
  56 #define TICK_SCALE(x)   ((x) >> 2)
  57 #elif HZ < 400
  58 #define TICK_SCALE(x)   ((x) >> 1)
  59 #elif HZ < 800
  60 #define TICK_SCALE(x)   (x)
  61 #elif HZ < 1600
  62 #define TICK_SCALE(x)   ((x) << 1)
  63 #else
  64 #define TICK_SCALE(x)   ((x) << 2)
  65 #endif
  66
  67 #define NICE_TO_TICKS(nice)     (TICK_SCALE(20-(nice))+1)
  68
  69
  70 /*
  71  *      Init task must be ok at boot for the ix86 as we will check its signals
  72  *      via the SMP irq return path.
  73  */
  74
  75 struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
  76
  77 /*
  78  * The tasklist_lock protects the linked list of processes.
  79  *
  80  * The scheduler lock is protecting against multiple entry
  81  * into the scheduling code, and doesn't need to worry
  82  * about interrupts (because interrupts cannot call the
  83  * scheduler).
  84  *
  85  * The run-queue lock locks the parts that actually access
  86  * and change the run-queues, and have to be interrupt-safe.
  87  */
  88 spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* second */
  89 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* third */
  90
  91 static LIST_HEAD(runqueue_head);
  92
  93 /*
  94  * We align per-CPU scheduling data on cacheline boundaries,
  95  * to prevent cacheline ping-pong.
  96  */
  97 static union {
  98         struct schedule_data {
  99                 struct task_struct * curr;
 100                 cycles_t last_schedule;
 101         } schedule_data;
 102         char __pad [SMP_CACHE_BYTES];
 103 } aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
 104
 105 #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
 106 #define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
 107
 108 struct kernel_stat kstat;
 109
 110 #ifdef CONFIG_SMP
 111
 112 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
 113 #define can_schedule(p,cpu) ((!(p)->has_cpu) && \
 114                                 ((p)->cpus_allowed & (1 << cpu)))
 115
 116 #else
 117
 118 #define idle_task(cpu) (&init_task)
 119 #define can_schedule(p,cpu) (1)
 120
 121 #endif
 122
 123 void scheduling_functions_start_here(void) { }
 124
 125 /*
 126  * This is the function that decides how desirable a process is..
 127  * You can weigh different processes against each other depending
 128  * on what CPU they've run on lately etc to try to handle cache
 129  * and TLB miss penalties.
 130  *
 131  * Return values:
 132  *       -1000: never select this
 133  *           0: out of time, recalculate counters (but it might still be
 134  *              selected)
 135  *         +ve: "goodness" value (the larger, the better)
 136  *       +1000: realtime process, select this.
 137  */
 138
 139 static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
 140 {
 141         int weight;
 142
 143         /*
 144          * select the current process after every other
 145          * runnable process, but before the idle thread.
 146          * Also, dont trigger a counter recalculation.
 147          */
 148         weight = -1;
 149         if (p->policy & SCHED_YIELD)
 150                 goto out;
 151
 152         /*
 153          * Non-RT process - normal case first.
 154          */
 155         if (p->policy == SCHED_OTHER) {
 156                 /*
 157                  * Give the process a first-approximation goodness value
 158                  * according to the number of clock-ticks it has left.
 159                  *
 160                  * Don't do any other calculations if the time slice is
 161                  * over..
 162                  */
 163                 weight = p->counter;
 164                 if (!weight)
 165                         goto out;
 166
 167 #ifdef CONFIG_SMP
 168                 /* Give a largish advantage to the same processor...   */
 169                 /* (this is equivalent to penalizing other processors) */
 170                 if (p->processor == this_cpu)
 171                         weight += PROC_CHANGE_PENALTY;
 172 #endif
 173
 174                 /* .. and a slight advantage to the current MM */
 175                 if (p->mm == this_mm || !p->mm)
 176                         weight += 1;
 177                 weight += 20 - p->nice;
 178                 goto out;
 179         }
 180
 181         /*
 182          * Realtime process, select the first one on the
 183          * runqueue (taking priorities within processes
 184          * into account).
 185          */
 186         weight = 1000 + p->rt_priority;
 187 out:
 188         return weight;
 189 }
 190
 191 /*
 192  * the 'goodness value' of replacing a process on a given CPU.
 193  * positive value means 'replace', zero or negative means 'dont'.
 194  */
 195 static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
 196 {
 197         return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
 198 }
 199
 200 /*
 201  * This is ugly, but reschedule_idle() is very timing-critical.
 202  * We enter with the runqueue spinlock held, but we might end
 203  * up unlocking it early, so the caller must not unlock the
 204  * runqueue, it's always done by reschedule_idle().
 205  *
 206  * This function must be inline as anything that saves and restores
 207  * flags has to do so within the same register window on sparc (Anton)
 208  */
 209 static FASTCALL(void reschedule_idle(struct task_struct * p));
 210
 211 static void reschedule_idle(struct task_struct * p)
 212 {
 213 #ifdef CONFIG_SMP
 214         int this_cpu = smp_processor_id();
 215         struct task_struct *tsk, *target_tsk;
 216         int cpu, best_cpu, i, max_prio;
 217         cycles_t oldest_idle;
 218
 219         /*
 220          * shortcut if the woken up task's last CPU is
 221          * idle now.
 222          */
 223         best_cpu = p->processor;
 224         if (can_schedule(p, best_cpu)) {
 225                 tsk = idle_task(best_cpu);
 226                 if (cpu_curr(best_cpu) == tsk) {
 227                         int need_resched;
 228 send_now_idle:
 229                         /*
 230                          * If need_resched == -1 then we can skip sending
 231                          * the IPI altogether, tsk->need_resched is
 232                          * actively watched by the idle thread.
 233                          */
 234                         need_resched = tsk->need_resched;
 235                         tsk->need_resched = 1;
 236                         if ((best_cpu != this_cpu) && !need_resched)
 237                                 smp_send_reschedule(best_cpu);
 238                         return;
 239                 }
 240         }
 241
 242         /*
 243          * We know that the preferred CPU has a cache-affine current
 244          * process, lets try to find a new idle CPU for the woken-up
 245          * process. Select the least recently active idle CPU. (that
 246          * one will have the least active cache context.) Also find
 247          * the executing process which has the least priority.
 248          */
 249         oldest_idle = (cycles_t) -1;
 250         target_tsk = NULL;
 251         max_prio = 1;
 252
 253         for (i = 0; i < smp_num_cpus; i++) {
 254                 cpu = cpu_logical_map(i);
 255                 if (!can_schedule(p, cpu))
 256                         continue;
 257                 tsk = cpu_curr(cpu);
 258                 /*
 259                  * We use the first available idle CPU. This creates
 260                  * a priority list between idle CPUs, but this is not
 261                  * a problem.
 262                  */
 263                 if (tsk == idle_task(cpu)) {
 264                         if (last_schedule(cpu) < oldest_idle) {
 265                                 oldest_idle = last_schedule(cpu);
 266                                 target_tsk = tsk;
 267                         }
 268                 } else {
 269                         if (oldest_idle == -1ULL) {
 270                                 int prio = preemption_goodness(tsk, p, cpu);
 271
 272                                 if (prio > max_prio) {
 273                                         max_prio = prio;
 274                                         target_tsk = tsk;
 275                                 }
 276                         }
 277                 }
 278         }
 279         tsk = target_tsk;
 280         if (tsk) {
 281                 if (oldest_idle != -1ULL)
 282                         goto send_now_idle;
 283                 tsk->need_resched = 1;
 284                 if (tsk->processor != this_cpu)
 285                         smp_send_reschedule(tsk->processor);
 286         }
 287         return;
 288
 289
 290 #else /* UP */
 291         int this_cpu = smp_processor_id();
 292         struct task_struct *tsk;
 293
 294         tsk = cpu_curr(this_cpu);
 295         if (preemption_goodness(tsk, p, this_cpu) > 1)
 296                 tsk->need_resched = 1;
 297 #endif
 298 }
 299
 300 /*
 301  * Careful!
 302  *
 303  * This has to add the process to the _beginning_ of the
 304  * run-queue, not the end. See the comment about "This is
 305  * subtle" in the scheduler proper..
 306  */
 307 static inline void add_to_runqueue(struct task_struct * p)
 308 {
 309         list_add(&p->run_list, &runqueue_head);
 310         nr_running++;
 311 }
 312
 313 static inline void move_last_runqueue(struct task_struct * p)
 314 {
 315         list_del(&p->run_list);
 316         list_add_tail(&p->run_list, &runqueue_head);
 317 }
 318
 319 static inline void move_first_runqueue(struct task_struct * p)
 320 {
 321         list_del(&p->run_list);
 322         list_add(&p->run_list, &runqueue_head);
 323 }
 324
 325 /*
 326  * Wake up a process. Put it on the run-queue if it's not
 327  * already there.  The "current" process is always on the
 328  * run-queue (except when the actual re-schedule is in
 329  * progress), and as such you're allowed to do the simpler
 330  * "current->state = TASK_RUNNING" to mark yourself runnable
 331  * without the overhead of this.
 332  */
 333 inline void wake_up_process(struct task_struct * p)
 334 {
 335         unsigned long flags;
 336
 337         /*
 338          * We want the common case fall through straight, thus the goto.
 339          */
 340         spin_lock_irqsave(&runqueue_lock, flags);
 341         p->state = TASK_RUNNING;
 342         if (task_on_runqueue(p))
 343                 goto out;
 344         add_to_runqueue(p);
 345         reschedule_idle(p);
 346 out:
 347         spin_unlock_irqrestore(&runqueue_lock, flags);
 348 }
 349
 350 static inline void wake_up_process_synchronous(struct task_struct * p)
 351 {
 352         unsigned long flags;
 353
 354         /*
 355          * We want the common case fall through straight, thus the goto.
 356          */
 357         spin_lock_irqsave(&runqueue_lock, flags);
 358         p->state = TASK_RUNNING;
 359         if (task_on_runqueue(p))
 360                 goto out;
 361         add_to_runqueue(p);
 362 out:
 363         spin_unlock_irqrestore(&runqueue_lock, flags);
 364 }
 365
 366 static void process_timeout(unsigned long __data)
 367 {
 368         struct task_struct * p = (struct task_struct *) __data;
 369
 370         wake_up_process(p);
 371 }
 372
 373 signed long schedule_timeout(signed long timeout)
 374 {
 375         struct timer_list timer;
 376         unsigned long expire;
 377
 378         switch (timeout)
 379         {
 380         case MAX_SCHEDULE_TIMEOUT:
 381                 /*
 382                  * These two special cases are useful to be comfortable
 383                  * in the caller. Nothing more. We could take
 384                  * MAX_SCHEDULE_TIMEOUT from one of the negative value
 385                  * but I' d like to return a valid offset (>=0) to allow
 386                  * the caller to do everything it want with the retval.
 387                  */
 388                 schedule();
 389                 goto out;
 390         default:
 391                 /*
 392                  * Another bit of PARANOID. Note that the retval will be
 393                  * 0 since no piece of kernel is supposed to do a check
 394                  * for a negative retval of schedule_timeout() (since it
 395                  * should never happens anyway). You just have the printk()
 396                  * that will tell you if something is gone wrong and where.
 397                  */
 398                 if (timeout < 0)
 399                 {
 400                         printk(KERN_ERR "schedule_timeout: wrong timeout "
 401                                "value %lx from %p\n", timeout,
 402                                __builtin_return_address(0));
 403                         current->state = TASK_RUNNING;
 404                         goto out;
 405                 }
 406         }
 407
 408         expire = timeout + jiffies;
 409
 410         init_timer(&timer);
 411         timer.expires = expire;
 412         timer.data = (unsigned long) current;
 413         timer.function = process_timeout;
 414
 415         add_timer(&timer);
 416         schedule();
 417         del_timer_sync(&timer);
 418
 419         timeout = expire - jiffies;
 420
 421  out:
 422         return timeout < 0 ? 0 : timeout;
 423 }
 424
 425 /*
 426  * schedule_tail() is getting called from the fork return path. This
 427  * cleans up all remaining scheduler things, without impacting the
 428  * common case.
 429  */
 430 static inline void __schedule_tail(struct task_struct *prev)
 431 {
 432 #ifdef CONFIG_SMP
 433         int policy;
 434
 435         /*
 436          * fast path falls through. We have to clear has_cpu before
 437          * checking prev->state to avoid a wakeup race - thus we
 438          * also have to protect against the task exiting early.
 439          */
 440         task_lock(prev);
 441         policy = prev->policy;
 442         prev->policy = policy & ~SCHED_YIELD;
 443         prev->has_cpu = 0;
 444         wmb();
 445         if (prev->state == TASK_RUNNING)
 446                 goto needs_resched;
 447
 448 out_unlock:
 449         task_unlock(prev);
 450         return;
 451
 452         /*
 453          * Slow path - we 'push' the previous process and
 454          * reschedule_idle() will attempt to find a new
 455          * processor for it. (but it might preempt the
 456          * current process as well.) We must take the runqueue
 457          * lock and re-check prev->state to be correct. It might
 458          * still happen that this process has a preemption
 459          * 'in progress' already - but this is not a problem and
 460          * might happen in other circumstances as well.
 461          */
 462 needs_resched:
 463         {
 464                 unsigned long flags;
 465
 466                 /*
 467                  * Avoid taking the runqueue lock in cases where
 468                  * no preemption-check is necessery:
 469                  */
 470                 if ((prev == idle_task(smp_processor_id())) ||
 471                                                 (policy & SCHED_YIELD))
 472                         goto out_unlock;
 473
 474                 spin_lock_irqsave(&runqueue_lock, flags);
 475                 if (prev->state == TASK_RUNNING)
 476                         reschedule_idle(prev);
 477                 spin_unlock_irqrestore(&runqueue_lock, flags);
 478                 goto out_unlock;
 479         }
 480 #else
 481         prev->policy &= ~SCHED_YIELD;
 482 #endif /* CONFIG_SMP */
 483 }
 484
 485 void schedule_tail(struct task_struct *prev)
 486 {
 487         __schedule_tail(prev);
 488 }
 489
 490 /*
 491  *  'schedule()' is the scheduler function. It's a very simple and nice
 492  * scheduler: it's not perfect, but certainly works for most things.
 493  *
 494  * The goto is "interesting".
 495  *
 496  *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 497  * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 498  * information in task[0] is never used.
 499  */
 500 asmlinkage void schedule(void)
 501 {
 502         struct schedule_data * sched_data;
 503         struct task_struct *prev, *next, *p;
 504         struct list_head *tmp;
 505         int this_cpu, c;
 506
 507         if (!current->active_mm) BUG();
 508         if (tq_scheduler)
 509                 goto handle_tq_scheduler;
 510 tq_scheduler_back:
 511
 512         prev = current;
 513         this_cpu = prev->processor;
 514
 515         if (in_interrupt())
 516                 goto scheduling_in_interrupt;
 517
 518         release_kernel_lock(prev, this_cpu);
 519
 520         /* Do "administrative" work here while we don't hold any locks */
 521         if (softirq_active(this_cpu) & softirq_mask(this_cpu))
 522                 goto handle_softirq;
 523 handle_softirq_back:
 524
 525         /*
 526          * 'sched_data' is protected by the fact that we can run
 527          * only one process per CPU.
 528          */
 529         sched_data = & aligned_data[this_cpu].schedule_data;
 530
 531         spin_lock_irq(&runqueue_lock);
 532
 533         /* move an exhausted RR process to be last.. */
 534         if (prev->policy == SCHED_RR)
 535                 goto move_rr_last;
 536 move_rr_back:
 537
 538         switch (prev->state & ~TASK_EXCLUSIVE) {
 539                 case TASK_INTERRUPTIBLE:
 540                         if (signal_pending(prev)) {
 541                                 prev->state = TASK_RUNNING;
 542                                 break;
 543                         }
 544                 default:
 545                         del_from_runqueue(prev);
 546                 case TASK_RUNNING:
 547         }
 548         prev->need_resched = 0;
 549
 550         /*
 551          * this is the scheduler proper:
 552          */
 553
 554 repeat_schedule:
 555         /*
 556          * Default process to select..
 557          */
 558         next = idle_task(this_cpu);
 559         c = -1000;
 560         if (prev->state == TASK_RUNNING)
 561                 goto still_running;
 562
 563 still_running_back:
 564         list_for_each(tmp, &runqueue_head) {
 565                 p = list_entry(tmp, struct task_struct, run_list);
 566                 if (can_schedule(p, this_cpu)) {
 567                         int weight = goodness(p, this_cpu, prev->active_mm);
 568                         if (weight > c)
 569                                 c = weight, next = p;
 570                 }
 571         }
 572
 573         /* Do we need to re-calculate counters? */
 574         if (!c)
 575                 goto recalculate;
 576         /*
 577          * from this point on nothing can prevent us from
 578          * switching to the next task, save this fact in
 579          * sched_data.
 580          */
 581         sched_data->curr = next;
 582 #ifdef CONFIG_SMP
 583         next->has_cpu = 1;
 584         next->processor = this_cpu;
 585 #endif
 586         spin_unlock_irq(&runqueue_lock);
 587
 588         if (prev == next)
 589                 goto same_process;
 590
 591 #ifdef CONFIG_SMP
 592         /*
 593          * maintain the per-process 'last schedule' value.
 594          * (this has to be recalculated even if we reschedule to
 595          * the same process) Currently this is only used on SMP,
 596          * and it's approximate, so we do not have to maintain
 597          * it while holding the runqueue spinlock.
 598          */
 599         sched_data->last_schedule = get_cycles();
 600
 601         /*
 602          * We drop the scheduler lock early (it's a global spinlock),
 603          * thus we have to lock the previous process from getting
 604          * rescheduled during switch_to().
 605          */
 606
 607 #endif /* CONFIG_SMP */
 608
 609         kstat.context_swtch++;
 610         /*
 611          * there are 3 processes which are affected by a context switch:
 612          *
 613          * prev == .... ==> (last => next)
 614          *
 615          * It's the 'much more previous' 'prev' that is on next's stack,
 616          * but prev is set to (the just run) 'last' process by switch_to().
 617          * This might sound slightly confusing but makes tons of sense.
 618          */
 619         prepare_to_switch();
 620         {
 621                 struct mm_struct *mm = next->mm;
 622                 struct mm_struct *oldmm = prev->active_mm;
 623                 if (!mm) {
 624                         if (next->active_mm) BUG();
 625                         next->active_mm = oldmm;
 626                         atomic_inc(&oldmm->mm_count);
 627                         enter_lazy_tlb(oldmm, next, this_cpu);
 628                 } else {
 629                         if (next->active_mm != mm) BUG();
 630                         switch_mm(oldmm, mm, next, this_cpu);
 631                 }
 632
 633                 if (!prev->mm) {
 634                         prev->active_mm = NULL;
 635                         mmdrop(oldmm);
 636                 }
 637         }
 638
 639         /*
 640          * This just switches the register state and the
 641          * stack.
 642          */
 643         switch_to(prev, next, prev);
 644         __schedule_tail(prev);
 645
 646 same_process:
 647         reacquire_kernel_lock(current);
 648         if (current->need_resched)
 649                 goto tq_scheduler_back;
 650
 651         return;
 652
 653 recalculate:
 654         {
 655                 struct task_struct *p;
 656                 spin_unlock_irq(&runqueue_lock);
 657                 read_lock(&tasklist_lock);
 658                 for_each_task(p)
 659                         p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
 660                 read_unlock(&tasklist_lock);
 661                 spin_lock_irq(&runqueue_lock);
 662         }
 663         goto repeat_schedule;
 664
 665 still_running:
 666         c = goodness(prev, this_cpu, prev->active_mm);
 667         next = prev;
 668         goto still_running_back;
 669
 670 handle_softirq:
 671         do_softirq();
 672         goto handle_softirq_back;
 673
 674 handle_tq_scheduler:
 675         /*
 676          * do not run the task queue with disabled interrupts,
 677          * cli() wouldn't work on SMP
 678          */
 679         sti();
 680         run_task_queue(&tq_scheduler);
 681         goto tq_scheduler_back;
 682
 683 move_rr_last:
 684         if (!prev->counter) {
 685                 prev->counter = NICE_TO_TICKS(prev->nice);
 686                 move_last_runqueue(prev);
 687         }
 688         goto move_rr_back;
 689
 690 scheduling_in_interrupt:
 691         printk("Scheduling in interrupt\n");
 692         BUG();
 693         return;
 694 }
 695
 696 static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
 697                                                 const int sync)
 698 {
 699         struct list_head *tmp, *head;
 700         struct task_struct *p, *best_exclusive;
 701         unsigned long flags;
 702         int best_cpu, irq;
 703
 704         if (!q || !waitqueue_active(q))
 705                 goto out;
 706
 707         best_cpu = smp_processor_id();
 708         irq = in_interrupt();
 709         best_exclusive = NULL;
 710         wq_write_lock_irqsave(&q->lock, flags);
 711
 712 #if WAITQUEUE_DEBUG
 713         CHECK_MAGIC_WQHEAD(q);
 714 #endif
 715
 716         head = &q->task_list;
 717 #if WAITQUEUE_DEBUG
 718         if (!head->next || !head->prev)
 719                 WQ_BUG();
 720 #endif
 721         tmp = head->next;
 722         while (tmp != head) {
 723                 unsigned int state;
 724                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
 725
 726                 tmp = tmp->next;
 727
 728 #if WAITQUEUE_DEBUG
 729                 CHECK_MAGIC(curr->__magic);
 730 #endif
 731                 p = curr->task;
 732                 state = p->state;
 733                 if (state & (mode & ~TASK_EXCLUSIVE)) {
 734 #if WAITQUEUE_DEBUG
 735                         curr->__waker = (long)__builtin_return_address(0);
 736 #endif
 737                         /*
 738                          * If waking up from an interrupt context then
 739                          * prefer processes which are affine to this
 740                          * CPU.
 741                          */
 742                         if (irq && (state & mode & TASK_EXCLUSIVE)) {
 743                                 if (!best_exclusive)
 744                                         best_exclusive = p;
 745                                 if (p->processor == best_cpu) {
 746                                         best_exclusive = p;
 747                                         break;
 748                                 }
 749                         } else {
 750                                 if (sync)
 751                                         wake_up_process_synchronous(p);
 752                                 else
 753                                         wake_up_process(p);
 754                                 if (state & mode & TASK_EXCLUSIVE)
 755                                         break;
 756                         }
 757                 }
 758         }
 759         if (best_exclusive) {
 760                 if (sync)
 761                         wake_up_process_synchronous(best_exclusive);
 762                 else
 763                         wake_up_process(best_exclusive);
 764         }
 765         wq_write_unlock_irqrestore(&q->lock, flags);
 766 out:
 767         return;
 768 }
 769
 770 void __wake_up(wait_queue_head_t *q, unsigned int mode)
 771 {
 772         __wake_up_common(q, mode, 0);
 773 }
 774
 775 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode)
 776 {
 777         __wake_up_common(q, mode, 1);
 778 }
 779
 780 #define SLEEP_ON_VAR                            \
 781         unsigned long flags;                    \
 782         wait_queue_t wait;                      \
 783         init_waitqueue_entry(&wait, current);
 784
 785 #define SLEEP_ON_HEAD                                   \
 786         wq_write_lock_irqsave(&q->lock,flags);          \
 787         __add_wait_queue(q, &wait);                     \
 788         wq_write_unlock(&q->lock);
 789
 790 #define SLEEP_ON_TAIL                                           \
 791         wq_write_lock_irq(&q->lock);                            \
 792         __remove_wait_queue(q, &wait);                          \
 793         wq_write_unlock_irqrestore(&q->lock,flags);
 794
 795 void interruptible_sleep_on(wait_queue_head_t *q)
 796 {
 797         SLEEP_ON_VAR
 798
 799         current->state = TASK_INTERRUPTIBLE;
 800
 801         SLEEP_ON_HEAD
 802         schedule();
 803         SLEEP_ON_TAIL
 804 }
 805
 806 long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 807 {
 808         SLEEP_ON_VAR
 809
 810         current->state = TASK_INTERRUPTIBLE;
 811
 812         SLEEP_ON_HEAD
 813         timeout = schedule_timeout(timeout);
 814         SLEEP_ON_TAIL
 815
 816         return timeout;
 817 }
 818
 819 void sleep_on(wait_queue_head_t *q)
 820 {
 821         SLEEP_ON_VAR
 822
 823         current->state = TASK_UNINTERRUPTIBLE;
 824
 825         SLEEP_ON_HEAD
 826         schedule();
 827         SLEEP_ON_TAIL
 828 }
 829
 830 long sleep_on_timeout(wait_queue_head_t *q, long timeout)
 831 {
 832         SLEEP_ON_VAR
 833
 834         current->state = TASK_UNINTERRUPTIBLE;
 835
 836         SLEEP_ON_HEAD
 837         timeout = schedule_timeout(timeout);
 838         SLEEP_ON_TAIL
 839
 840         return timeout;
 841 }
 842
 843 void scheduling_functions_end_here(void) { }
 844
 845 #ifndef __alpha__
 846
 847 /*
 848  * This has been replaced by sys_setpriority.  Maybe it should be
 849  * moved into the arch dependent tree for those ports that require
 850  * it for backward compatibility?
 851  */
 852
 853 asmlinkage long sys_nice(int increment)
 854 {
 855         long newprio;
 856
 857         /*
 858          *      Setpriority might change our priority at the same moment.
 859          *      We don't have to worry. Conceptually one call occurs first
 860          *      and we have a single winner.
 861          */
 862         if (increment < 0) {
 863                 if (!capable(CAP_SYS_NICE))
 864                         return -EPERM;
 865                 if (increment < -40)
 866                         increment = -40;
 867         }
 868         if (increment > 40)
 869                 increment = 40;
 870
 871         newprio = current->nice + increment;
 872         if (newprio < -20)
 873                 newprio = -20;
 874         if (newprio > 19)
 875                 newprio = 19;
 876         current->nice = newprio;
 877         return 0;
 878 }
 879
 880 #endif
 881
 882 static inline struct task_struct *find_process_by_pid(pid_t pid)
 883 {
 884         struct task_struct *tsk = current;
 885
 886         if (pid)
 887                 tsk = find_task_by_pid(pid);
 888         return tsk;
 889 }
 890
 891 static int setscheduler(pid_t pid, int policy,
 892                         struct sched_param *param)
 893 {
 894         struct sched_param lp;
 895         struct task_struct *p;
 896         int retval;
 897
 898         retval = -EINVAL;
 899         if (!param || pid < 0)
 900                 goto out_nounlock;
 901
 902         retval = -EFAULT;
 903         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
 904                 goto out_nounlock;
 905
 906         /*
 907          * We play safe to avoid deadlocks.
 908          */
 909         spin_lock_irq(&runqueue_lock);
 910         read_lock(&tasklist_lock);
 911
 912         p = find_process_by_pid(pid);
 913
 914         retval = -ESRCH;
 915         if (!p)
 916                 goto out_unlock;
 917
 918         if (policy < 0)
 919                 policy = p->policy;
 920         else {
 921                 retval = -EINVAL;
 922                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
 923                                 policy != SCHED_OTHER)
 924                         goto out_unlock;
 925         }
 926
 927         /*
 928          * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
 929          * priority for SCHED_OTHER is 0.
 930          */
 931         retval = -EINVAL;
 932         if (lp.sched_priority < 0 || lp.sched_priority > 99)
 933                 goto out_unlock;
 934         if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
 935                 goto out_unlock;
 936
 937         retval = -EPERM;
 938         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
 939             !capable(CAP_SYS_NICE))
 940                 goto out_unlock;
 941         if ((current->euid != p->euid) && (current->euid != p->uid) &&
 942             !capable(CAP_SYS_NICE))
 943                 goto out_unlock;
 944
 945         retval = 0;
 946         p->policy = policy;
 947         p->rt_priority = lp.sched_priority;
 948         if (task_on_runqueue(p))
 949                 move_first_runqueue(p);
 950
 951         current->need_resched = 1;
 952
 953 out_unlock:
 954         read_unlock(&tasklist_lock);
 955         spin_unlock_irq(&runqueue_lock);
 956
 957 out_nounlock:
 958         return retval;
 959 }
 960
 961 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
 962                                       struct sched_param *param)
 963 {
 964         return setscheduler(pid, policy, param);
 965 }
 966
 967 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param)
 968 {
 969         return setscheduler(pid, -1, param);
 970 }
 971
 972 asmlinkage long sys_sched_getscheduler(pid_t pid)
 973 {
 974         struct task_struct *p;
 975         int retval;
 976
 977         retval = -EINVAL;
 978         if (pid < 0)
 979                 goto out_nounlock;
 980
 981         retval = -ESRCH;
 982         read_lock(&tasklist_lock);
 983         p = find_process_by_pid(pid);
 984         if (p)
 985                 retval = p->policy & ~SCHED_YIELD;
 986         read_unlock(&tasklist_lock);
 987
 988 out_nounlock:
 989         return retval;
 990 }
 991
 992 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
 993 {
 994         struct task_struct *p;
 995         struct sched_param lp;
 996         int retval;
 997
 998         retval = -EINVAL;
 999         if (!param || pid < 0)
1000                 goto out_nounlock;
1001
1002         read_lock(&tasklist_lock);
1003         p = find_process_by_pid(pid);
1004         retval = -ESRCH;
1005         if (!p)
1006                 goto out_unlock;
1007         lp.sched_priority = p->rt_priority;
1008         read_unlock(&tasklist_lock);
1009
1010         /*
1011          * This one might sleep, we cannot do it with a spinlock held ...
1012          */
1013         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
1014
1015 out_nounlock:
1016         return retval;
1017
1018 out_unlock:
1019         read_unlock(&tasklist_lock);
1020         return retval;
1021 }
1022
1023 asmlinkage long sys_sched_yield(void)
1024 {
1025         /*
1026          * Trick. sched_yield() first counts the number of truly
1027          * 'pending' runnable processes, then returns if it's
1028          * only the current processes. (This test does not have
1029          * to be atomic.) In threaded applications this optimization
1030          * gets triggered quite often.
1031          */
1032
1033         int nr_pending = nr_running;
1034
1035 #if CONFIG_SMP
1036         int i;
1037
1038         // Substract non-idle processes running on other CPUs.
1039         for (i = 0; i < smp_num_cpus; i++)
1040                 if (aligned_data[i].schedule_data.curr != idle_task(i))
1041                         nr_pending--;
1042 #else
1043         // on UP this process is on the runqueue as well
1044         nr_pending--;
1045 #endif
1046         if (nr_pending) {
1047                 /*
1048                  * This process can only be rescheduled by us,
1049                  * so this is safe without any locking.
1050                  */
1051                 if (current->policy == SCHED_OTHER)
1052                         current->policy |= SCHED_YIELD;
1053                 current->need_resched = 1;
1054         }
1055         return 0;
1056 }
1057
1058 asmlinkage long sys_sched_get_priority_max(int policy)
1059 {
1060         int ret = -EINVAL;
1061
1062         switch (policy) {
1063         case SCHED_FIFO:
1064         case SCHED_RR:
1065                 ret = 99;
1066                 break;
1067         case SCHED_OTHER:
1068                 ret = 0;
1069                 break;
1070         }
1071         return ret;
1072 }
1073
1074 asmlinkage long sys_sched_get_priority_min(int policy)
1075 {
1076         int ret = -EINVAL;
1077
1078         switch (policy) {
1079         case SCHED_FIFO:
1080         case SCHED_RR:
1081                 ret = 1;
1082                 break;
1083         case SCHED_OTHER:
1084                 ret = 0;
1085         }
1086         return ret;
1087 }
1088
1089 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
1090 {
1091         struct timespec t;
1092         struct task_struct *p;
1093         int retval = -EINVAL;
1094
1095         if (pid < 0)
1096                 goto out_nounlock;
1097
1098         retval = -ESRCH;
1099         read_lock(&tasklist_lock);
1100         p = find_process_by_pid(pid);
1101         if (p)
1102                 jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
1103                                     &t);
1104         read_unlock(&tasklist_lock);
1105         if (p)
1106                 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1107 out_nounlock:
1108         return retval;
1109 }
1110
1111 static void show_task(struct task_struct * p)
1112 {
1113         unsigned long free = 0;
1114         int state;
1115         static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
1116
1117         printk("%-8s  ", p->comm);
1118         state = p->state ? ffz(~p->state) + 1 : 0;
1119         if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
1120                 printk(stat_nam[state]);
1121         else
1122                 printk(" ");
1123 #if (BITS_PER_LONG == 32)
1124         if (p == current)
1125                 printk(" current  ");
1126         else
1127                 printk(" %08lX ", thread_saved_pc(&p->thread));
1128 #else
1129         if (p == current)
1130                 printk("   current task   ");
1131         else
1132                 printk(" %016lx ", thread_saved_pc(&p->thread));
1133 #endif
1134         {
1135                 unsigned long * n = (unsigned long *) (p+1);
1136                 while (!*n)
1137                         n++;
1138                 free = (unsigned long) n - (unsigned long)(p+1);
1139         }
1140         printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
1141         if (p->p_cptr)
1142                 printk("%5d ", p->p_cptr->pid);
1143         else
1144                 printk("      ");
1145         if (!p->mm)
1146                 printk(" (L-TLB) ");
1147         else
1148                 printk(" (NOTLB) ");
1149         if (p->p_ysptr)
1150                 printk("%7d", p->p_ysptr->pid);
1151         else
1152                 printk("       ");
1153         if (p->p_osptr)
1154                 printk(" %5d\n", p->p_osptr->pid);
1155         else
1156                 printk("\n");
1157
1158         {
1159                 struct sigqueue *q;
1160                 char s[sizeof(sigset_t)*2+1], b[sizeof(sigset_t)*2+1];
1161
1162                 render_sigset_t(&p->pending.signal, s);
1163                 render_sigset_t(&p->blocked, b);
1164                 printk("   sig: %d %s %s :", signal_pending(p), s, b);
1165                 for (q = p->pending.head; q ; q = q->next)
1166                         printk(" %d", q->info.si_signo);
1167                 printk(" X\n");
1168         }
1169 }
1170
1171 char * render_sigset_t(sigset_t *set, char *buffer)
1172 {
1173         int i = _NSIG, x;
1174         do {
1175                 i -= 4, x = 0;
1176                 if (sigismember(set, i+1)) x |= 1;
1177                 if (sigismember(set, i+2)) x |= 2;
1178                 if (sigismember(set, i+3)) x |= 4;
1179                 if (sigismember(set, i+4)) x |= 8;
1180                 *buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
1181         } while (i >= 4);
1182         *buffer = 0;
1183         return buffer;
1184 }
1185
1186 void show_state(void)
1187 {
1188         struct task_struct *p;
1189
1190 #if (BITS_PER_LONG == 32)
1191         printk("\n"
1192                "                         free                        sibling\n");
1193         printk("  task             PC    stack   pid father child younger older\n");
1194 #else
1195         printk("\n"
1196                "                                 free                        sibling\n");
1197         printk("  task                 PC        stack   pid father child younger older\n");
1198 #endif
1199         read_lock(&tasklist_lock);
1200         for_each_task(p)
1201                 show_task(p);
1202         read_unlock(&tasklist_lock);
1203 }
1204
1205 /*
1206  *      Put all the gunge required to become a kernel thread without
1207  *      attached user resources in one place where it belongs.
1208  */
1209
1210 void daemonize(void)
1211 {
1212         struct fs_struct *fs;
1213
1214
1215         /*
1216          * If we were started as result of loading a module, close all of the
1217          * user space pages.  We don't need them, and if we didn't close them
1218          * they would be locked into memory.
1219          */
1220         exit_mm(current);
1221
1222         current->session = 1;
1223         current->pgrp = 1;
1224
1225         /* Become as one with the init task */
1226
1227         exit_fs(current);       /* current->fs->count--; */
1228         fs = init_task.fs;
1229         current->fs = fs;
1230         atomic_inc(&fs->count);
1231         exit_files(current);
1232         current->files = init_task.files;
1233         atomic_inc(&current->files->count);
1234 }
1235
1236 void __init init_idle(void)
1237 {
1238         struct schedule_data * sched_data;
1239         sched_data = &aligned_data[smp_processor_id()].schedule_data;
1240
1241         if (current != &init_task && task_on_runqueue(current)) {
1242                 printk("UGH! (%d:%d) was on the runqueue, removing.\n",
1243                         smp_processor_id(), current->pid);
1244                 del_from_runqueue(current);
1245         }
1246         sched_data->curr = current;
1247         sched_data->last_schedule = get_cycles();
1248 }
1249
1250 extern void init_timervecs (void);
1251
1252 void __init sched_init(void)
1253 {
1254         /*
1255          * We have to do a little magic to get the first
1256          * process right in SMP mode.
1257          */
1258         int cpu = smp_processor_id();
1259         int nr;
1260
1261         init_task.processor = cpu;
1262
1263         for(nr = 0; nr < PIDHASH_SZ; nr++)
1264                 pidhash[nr] = NULL;
1265
1266         init_timervecs();
1267
1268         init_bh(TIMER_BH, timer_bh);
1269         init_bh(TQUEUE_BH, tqueue_bh);
1270         init_bh(IMMEDIATE_BH, immediate_bh);
1271
1272         /*
1273          * The boot idle thread does lazy MMU switching as well:
1274          */
1275         atomic_inc(&init_mm.mm_count);
1276         enter_lazy_tlb(&init_mm, current, cpu);
1277 }