/*
 *  linux/kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *              make semaphores SMP safe
 *  1998-11-19  Implemented schedule_timeout() and related stuff
 *              by Andrea Arcangeli
 *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
 */

/*
 * 'sched.c' is the main kernel file. It contains scheduling primitives
 * (sleep_on, wakeup, schedule etc) as well as a number of simple system
 * call functions (type getpid()), which just extract a field from
 * current-task
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>

#include <asm/uaccess.h>
#include <asm/mmu_context.h>

extern void timer_bh(void);
extern void tqueue_bh(void);
extern void immediate_bh(void);

/*
 * scheduler variables
 */

unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */

extern void mem_use(void);

/*
 * Scheduling quanta.
 *
 * NOTE! The unix "nice" value influences how long a process
 * gets. The nice value ranges from -20 to +19, where a -20
 * is a "high-priority" task, and a "+10" is a low-priority
 * task.
 *
 * We want the time-slice to be around 50ms or so, so this
 * calculation depends on the value of HZ.
 */
#if HZ < 200
#define TICK_SCALE(x)	((x) >> 2)
#elif HZ < 400
#define TICK_SCALE(x)	((x) >> 1)
#elif HZ < 800
#define TICK_SCALE(x)	(x)
#elif HZ < 1600
#define TICK_SCALE(x)	((x) << 1)
#else
#define TICK_SCALE(x)	((x) << 2)
#endif

#define NICE_TO_TICKS(nice)	(TICK_SCALE(20-(nice))+1)
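
/*
 * Illustrative arithmetic (not in the original source): with HZ == 100
 * (10 ms per tick), TICK_SCALE(x) is (x) >> 2, so
 *
 *	NICE_TO_TICKS(-20) = (40 >> 2) + 1 = 11 ticks ~ 110 ms
 *	NICE_TO_TICKS(0)   = (20 >> 2) + 1 =  6 ticks ~  60 ms
 *	NICE_TO_TICKS(19)  = ( 1 >> 2) + 1 =  1 tick  ~  10 ms
 *
 * i.e. the default time-slice lands in the ~50 ms ballpark mentioned
 * above, and niced tasks get proportionally shorter quanta.
 */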
/*
 *	Init task must be ok at boot for the ix86 as we will check its signals
 *	via the SMP irq return path.
 */

struct task_struct * init_tasks[NR_CPUS] = {&init_task, };

/*
 * The tasklist_lock protects the linked list of processes.
 *
 * The runqueue_lock locks the parts that actually access
 * and change the run-queues, and have to be interrupt-safe.
 *
 * If both locks are to be concurrently held, the runqueue_lock
 * nests inside the tasklist_lock.
 */
spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;	/* outer */

static LIST_HEAD(runqueue_head);

/*
 * We align per-CPU scheduling data on cacheline boundaries,
 * to prevent cacheline ping-pong.
 */
static union {
	struct schedule_data {
		struct task_struct * curr;
		cycles_t last_schedule;
	} schedule_data;
	char __pad [SMP_CACHE_BYTES];
} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};

#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule

struct kernel_stat kstat;

#ifdef CONFIG_SMP

#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
#define can_schedule(p,cpu) ((!(p)->has_cpu) && \
				((p)->cpus_allowed & (1 << cpu)))

#else

#define idle_task(cpu) (&init_task)
#define can_schedule(p,cpu) (1)

#endif
void scheduling_functions_start_here(void) { }

/*
 * This is the function that decides how desirable a process is..
 * You can weigh different processes against each other depending
 * on what CPU they've run on lately etc to try to handle cache
 * and TLB miss penalties.
 *
 * Return values:
 *	 -1000: never select this
 *	     0: out of time, recalculate counters (but it might still be
 *		selected)
 *	   +ve: "goodness" value (the larger, the better)
 *	 +1000: realtime process, select this.
 */

static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
{
	int weight;

	/*
	 * select the current process after every other
	 * runnable process, but before the idle thread.
	 * Also, dont trigger a counter recalculation.
	 */
	weight = -1;
	if (p->policy & SCHED_YIELD)
		goto out;

	/*
	 * Non-RT process - normal case first.
	 */
	if (p->policy == SCHED_OTHER) {
		/*
		 * Give the process a first-approximation goodness value
		 * according to the number of clock-ticks it has left.
		 *
		 * Don't do any other calculations if the time slice is
		 * over..
		 */
		weight = p->counter;
		if (!weight)
			goto out;

#ifdef CONFIG_SMP
		/* Give a largish advantage to the same processor...   */
		/* (this is equivalent to penalizing other processors) */
		if (p->processor == this_cpu)
			weight += PROC_CHANGE_PENALTY;
#endif

		/* .. and a slight advantage to the current MM */
		if (p->mm == this_mm || !p->mm)
			weight += 1;
		weight += 20 - p->nice;
		goto out;
	}

	/*
	 * Realtime process, select the first one on the
	 * runqueue (taking priorities within processes
	 * into account).
	 */
	weight = 1000 + p->rt_priority;
out:
	return weight;
}
/*
 * the 'goodness value' of replacing a process on a given CPU.
 * positive value means 'replace', zero or negative means 'dont'.
 */
static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
{
	return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
}

/*
 * This is ugly, but reschedule_idle() is very timing-critical.
 * We are called with the runqueue spinlock held and we must
 * not claim the tasklist_lock.
 */
static FASTCALL(void reschedule_idle(struct task_struct * p));

static void reschedule_idle(struct task_struct * p)
{
#ifdef CONFIG_SMP
	int this_cpu = smp_processor_id();
	struct task_struct *tsk, *target_tsk;
	int cpu, best_cpu, i, max_prio;
	cycles_t oldest_idle;

	/*
	 * shortcut if the woken up task's last CPU is
	 * idle now.
	 */
	best_cpu = p->processor;
	if (can_schedule(p, best_cpu)) {
		tsk = idle_task(best_cpu);
		if (cpu_curr(best_cpu) == tsk) {
			int need_resched;
send_now_idle:
			/*
			 * If need_resched == -1 then we can skip sending
			 * the IPI altogether, tsk->need_resched is
			 * actively watched by the idle thread.
			 */
			need_resched = tsk->need_resched;
			tsk->need_resched = 1;
			if ((best_cpu != this_cpu) && !need_resched)
				smp_send_reschedule(best_cpu);
			return;
		}
	}

	/*
	 * We know that the preferred CPU has a cache-affine current
	 * process, lets try to find a new idle CPU for the woken-up
	 * process. Select the least recently active idle CPU. (that
	 * one will have the least active cache context.) Also find
	 * the executing process which has the least priority.
	 */
	oldest_idle = (cycles_t) -1;
	target_tsk = NULL;
	max_prio = 1;

	for (i = 0; i < smp_num_cpus; i++) {
		cpu = cpu_logical_map(i);
		if (!can_schedule(p, cpu))
			continue;
		tsk = cpu_curr(cpu);
		/*
		 * We use the first available idle CPU. This creates
		 * a priority list between idle CPUs, but this is not
		 * a problem.
		 */
		if (tsk == idle_task(cpu)) {
			if (last_schedule(cpu) < oldest_idle) {
				oldest_idle = last_schedule(cpu);
				target_tsk = tsk;
			}
		} else {
			if (oldest_idle == -1ULL) {
				int prio = preemption_goodness(tsk, p, cpu);

				if (prio > max_prio) {
					max_prio = prio;
					target_tsk = tsk;
				}
			}
		}
	}
	tsk = target_tsk;
	if (tsk) {
		if (oldest_idle != -1ULL) {
			best_cpu = tsk->processor;
			goto send_now_idle;
		}
		tsk->need_resched = 1;
		if (tsk->processor != this_cpu)
			smp_send_reschedule(tsk->processor);
	}
	return;

#else /* UP */
	int this_cpu = smp_processor_id();
	struct task_struct *tsk;

	tsk = cpu_curr(this_cpu);
	if (preemption_goodness(tsk, p, this_cpu) > 1)
		tsk->need_resched = 1;
#endif
}
/*
 * Careful!
 *
 * This has to add the process to the _beginning_ of the
 * run-queue, not the end. See the comment about "This is
 * subtle" in the scheduler proper..
 */
static inline void add_to_runqueue(struct task_struct * p)
{
	list_add(&p->run_list, &runqueue_head);
	nr_running++;
}

static inline void move_last_runqueue(struct task_struct * p)
{
	list_del(&p->run_list);
	list_add_tail(&p->run_list, &runqueue_head);
}

static inline void move_first_runqueue(struct task_struct * p)
{
	list_del(&p->run_list);
	list_add(&p->run_list, &runqueue_head);
}

/*
 * Wake up a process. Put it on the run-queue if it's not
 * already there.  The "current" process is always on the
 * run-queue (except when the actual re-schedule is in
 * progress), and as such you're allowed to do the simpler
 * "current->state = TASK_RUNNING" to mark yourself runnable
 * without the overhead of this.
 */
inline void wake_up_process(struct task_struct * p)
{
	unsigned long flags;

	/*
	 * We want the common case fall through straight, thus the goto.
	 */
	spin_lock_irqsave(&runqueue_lock, flags);
	p->state = TASK_RUNNING;
	if (task_on_runqueue(p))
		goto out;
	add_to_runqueue(p);
	reschedule_idle(p);
out:
	spin_unlock_irqrestore(&runqueue_lock, flags);
}

static inline void wake_up_process_synchronous(struct task_struct * p)
{
	unsigned long flags;

	/*
	 * We want the common case fall through straight, thus the goto.
	 */
	spin_lock_irqsave(&runqueue_lock, flags);
	p->state = TASK_RUNNING;
	if (task_on_runqueue(p))
		goto out;
	add_to_runqueue(p);
out:
	spin_unlock_irqrestore(&runqueue_lock, flags);
}
static void process_timeout(unsigned long __data)
{
	struct task_struct * p = (struct task_struct *) __data;

	wake_up_process(p);
}

signed long schedule_timeout(signed long timeout)
{
	struct timer_list timer;
	unsigned long expire;

	switch (timeout)
	{
	case MAX_SCHEDULE_TIMEOUT:
		/*
		 * These two special cases are useful to be comfortable
		 * in the caller. Nothing more. We could take
		 * MAX_SCHEDULE_TIMEOUT from one of the negative value
		 * but I' d like to return a valid offset (>=0) to allow
		 * the caller to do everything it want with the retval.
		 */
		schedule();
		goto out;
	default:
		/*
		 * Another bit of PARANOID. Note that the retval will be
		 * 0 since no piece of kernel is supposed to do a check
		 * for a negative retval of schedule_timeout() (since it
		 * should never happens anyway). You just have the printk()
		 * that will tell you if something is gone wrong and where.
		 */
		if (timeout < 0)
		{
			printk(KERN_ERR "schedule_timeout: wrong timeout "
			       "value %lx from %p\n", timeout,
			       __builtin_return_address(0));
			current->state = TASK_RUNNING;
			goto out;
		}
	}

	expire = timeout + jiffies;

	init_timer(&timer);
	timer.expires = expire;
	timer.data = (unsigned long) current;
	timer.function = process_timeout;

	add_timer(&timer);
	schedule();
	del_timer_sync(&timer);

	timeout = expire - jiffies;

 out:
	return timeout < 0 ? 0 : timeout;
}
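
/*
 * Typical caller pattern (illustrative sketch, not part of the original
 * file): set the task state first, then call schedule_timeout():
 *
 *	current->state = TASK_INTERRUPTIBLE;
 *	remaining = schedule_timeout(30 * HZ);
 *
 * The task sleeps for up to 30 seconds; the return value is the number of
 * jiffies left if the task was woken early (e.g. by wake_up_process() or a
 * signal), or 0 if the full timeout expired.  Passing MAX_SCHEDULE_TIMEOUT
 * arms no timer at all and relies entirely on an explicit wakeup.
 */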
/*
 * schedule_tail() is getting called from the fork return path. This
 * cleans up all remaining scheduler things, without impacting the
 * common case.
 */
static inline void __schedule_tail(struct task_struct *prev)
{
#ifdef CONFIG_SMP
	int policy;

	/*
	 * prev->policy can be written from here only before `prev'
	 * can be scheduled (before setting prev->has_cpu to zero).
	 * Of course it must also be read before allowing prev
	 * to be rescheduled, but since the write depends on the read
	 * to complete, wmb() is enough. (the spin_lock() acquired
	 * before setting has_cpu is not enough because the spin_lock()
	 * common code semantics allows code outside the critical section
	 * to enter inside the critical section)
	 */
	policy = prev->policy;
	prev->policy = policy & ~SCHED_YIELD;
	wmb();

	/*
	 * fast path falls through. We have to clear has_cpu before
	 * checking prev->state to avoid a wakeup race - thus we
	 * also have to protect against the task exiting early.
	 */
	task_lock(prev);
	prev->has_cpu = 0;
	mb();
	if (prev->state == TASK_RUNNING)
		goto needs_resched;

out_unlock:
	task_unlock(prev);	/* Synchronise here with release_task() if prev is TASK_ZOMBIE */
	return;

	/*
	 * Slow path - we 'push' the previous process and
	 * reschedule_idle() will attempt to find a new
	 * processor for it. (but it might preempt the
	 * current process as well.) We must take the runqueue
	 * lock and re-check prev->state to be correct. It might
	 * still happen that this process has a preemption
	 * 'in progress' already - but this is not a problem and
	 * might happen in other circumstances as well.
	 */
needs_resched:
	{
		unsigned long flags;

		/*
		 * Avoid taking the runqueue lock in cases where
		 * no preemption-check is necessery:
		 */
		if ((prev == idle_task(smp_processor_id())) ||
						(policy & SCHED_YIELD))
			goto out_unlock;

		spin_lock_irqsave(&runqueue_lock, flags);
		if (prev->state == TASK_RUNNING)
			reschedule_idle(prev);
		spin_unlock_irqrestore(&runqueue_lock, flags);
		goto out_unlock;
	}
#else
	prev->policy &= ~SCHED_YIELD;
#endif /* CONFIG_SMP */
}

void schedule_tail(struct task_struct *prev)
{
	__schedule_tail(prev);
}
/*
 *  'schedule()' is the scheduler function. It's a very simple and nice
 * scheduler: it's not perfect, but certainly works for most things.
 *
 * The goto is "interesting".
 *
 *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 * information in task[0] is never used.
 */
asmlinkage void schedule(void)
{
	struct schedule_data * sched_data;
	struct task_struct *prev, *next, *p;
	struct list_head *tmp;
	int this_cpu, c;

	if (!current->active_mm) BUG();
need_resched_back:
	prev = current;
	this_cpu = prev->processor;

	if (in_interrupt())
		goto scheduling_in_interrupt;

	release_kernel_lock(prev, this_cpu);

	/* Do "administrative" work here while we don't hold any locks */
	if (softirq_active(this_cpu) & softirq_mask(this_cpu))
		goto handle_softirq;
handle_softirq_back:

	/*
	 * 'sched_data' is protected by the fact that we can run
	 * only one process per CPU.
	 */
	sched_data = & aligned_data[this_cpu].schedule_data;

	spin_lock_irq(&runqueue_lock);

	/* move an exhausted RR process to be last.. */
	if (prev->policy == SCHED_RR)
		goto move_rr_last;
move_rr_back:

	switch (prev->state) {
		case TASK_INTERRUPTIBLE:
			if (signal_pending(prev)) {
				prev->state = TASK_RUNNING;
				break;
			}
		default:
			del_from_runqueue(prev);
		case TASK_RUNNING:
	}
	prev->need_resched = 0;

	/*
	 * this is the scheduler proper:
	 */

repeat_schedule:
	/*
	 * Default process to select..
	 */
	next = idle_task(this_cpu);
	c = -1000;
	if (prev->state == TASK_RUNNING)
		goto still_running;

still_running_back:
	list_for_each(tmp, &runqueue_head) {
		p = list_entry(tmp, struct task_struct, run_list);
		if (can_schedule(p, this_cpu)) {
			int weight = goodness(p, this_cpu, prev->active_mm);
			if (weight > c)
				c = weight, next = p;
		}
	}

	/* Do we need to re-calculate counters? */
	if (!c)
		goto recalculate;
	/*
	 * from this point on nothing can prevent us from
	 * switching to the next task, save this fact in
	 * sched_data.
	 */
	sched_data->curr = next;
#ifdef CONFIG_SMP
	next->has_cpu = 1;
	next->processor = this_cpu;
#endif
	spin_unlock_irq(&runqueue_lock);

	if (prev == next)
		goto same_process;

#ifdef CONFIG_SMP
	/*
	 * maintain the per-process 'last schedule' value.
	 * (this has to be recalculated even if we reschedule to
	 * the same process) Currently this is only used on SMP,
	 * and it's approximate, so we do not have to maintain
	 * it while holding the runqueue spinlock.
	 */
	sched_data->last_schedule = get_cycles();

	/*
	 * We drop the scheduler lock early (it's a global spinlock),
	 * thus we have to lock the previous process from getting
	 * rescheduled during switch_to().
	 */

#endif /* CONFIG_SMP */

	kstat.context_swtch++;
	/*
	 * there are 3 processes which are affected by a context switch:
	 *
	 * prev == .... ==> (last => next)
	 *
	 * It's the 'much more previous' 'prev' that is on next's stack,
	 * but prev is set to (the just run) 'last' process by switch_to().
	 * This might sound slightly confusing but makes tons of sense.
	 */
	prepare_to_switch();
	{
		struct mm_struct *mm = next->mm;
		struct mm_struct *oldmm = prev->active_mm;
		if (!mm) {
			if (next->active_mm) BUG();
			next->active_mm = oldmm;
			atomic_inc(&oldmm->mm_count);
			enter_lazy_tlb(oldmm, next, this_cpu);
		} else {
			if (next->active_mm != mm) BUG();
			switch_mm(oldmm, mm, next, this_cpu);
		}

		if (!prev->mm) {
			prev->active_mm = NULL;
			mmdrop(oldmm);
		}
	}

	/*
	 * This just switches the register state and the
	 * stack.
	 */
	switch_to(prev, next, prev);
	__schedule_tail(prev);

same_process:
	reacquire_kernel_lock(current);
	if (current->need_resched)
		goto need_resched_back;

	return;

recalculate:
	{
		struct task_struct *p;
		spin_unlock_irq(&runqueue_lock);
		read_lock(&tasklist_lock);
		for_each_task(p)
			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
		read_unlock(&tasklist_lock);
		spin_lock_irq(&runqueue_lock);
	}
	goto repeat_schedule;

still_running:
	c = goodness(prev, this_cpu, prev->active_mm);
	next = prev;
	goto still_running_back;

handle_softirq:
	do_softirq();
	goto handle_softirq_back;

move_rr_last:
	if (!prev->counter) {
		prev->counter = NICE_TO_TICKS(prev->nice);
		move_last_runqueue(prev);
	}
	goto move_rr_back;

scheduling_in_interrupt:
	printk("Scheduling in interrupt\n");
	BUG();
	return;
}
static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
				     unsigned int wq_mode, const int sync)
{
	struct list_head *tmp, *head;
	struct task_struct *p, *best_exclusive;
	unsigned long flags;
	int best_cpu, irq;

	if (!q)
		goto out;

	best_cpu = smp_processor_id();
	irq = in_interrupt();
	best_exclusive = NULL;
	wq_write_lock_irqsave(&q->lock, flags);

#if WAITQUEUE_DEBUG
	CHECK_MAGIC_WQHEAD(q);
#endif

	head = &q->task_list;
#if WAITQUEUE_DEBUG
	if (!head->next || !head->prev)
		WQ_BUG();
#endif
	tmp = head->next;
	while (tmp != head) {
		unsigned int state;
		wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);

		tmp = tmp->next;

#if WAITQUEUE_DEBUG
		CHECK_MAGIC(curr->__magic);
#endif
		p = curr->task;
		state = p->state;
		if (state & mode) {
#if WAITQUEUE_DEBUG
			curr->__waker = (long)__builtin_return_address(0);
#endif
			/*
			 * If waking up from an interrupt context then
			 * prefer processes which are affine to this
			 * CPU.
			 */
			if (irq && (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)) {
				if (!best_exclusive)
					best_exclusive = p;
				if (p->processor == best_cpu) {
					best_exclusive = p;
					break;
				}
			} else {
				if (sync)
					wake_up_process_synchronous(p);
				else
					wake_up_process(p);
				if (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)
					break;
			}
		}
	}
	if (best_exclusive) {
		if (sync)
			wake_up_process_synchronous(best_exclusive);
		else
			wake_up_process(best_exclusive);
	}
	wq_write_unlock_irqrestore(&q->lock, flags);
out:
	return;
}

void __wake_up(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
{
	__wake_up_common(q, mode, wq_mode, 0);
}

void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
{
	__wake_up_common(q, mode, wq_mode, 1);
}
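
/*
 * Note (added for clarity): most callers do not invoke __wake_up()
 * directly; they use the wake_up*() wrapper macros from <linux/sched.h>,
 * which supply the task-state mask to wake (e.g. only TASK_INTERRUPTIBLE
 * for wake_up_interruptible()) together with the wait-queue mode bits.
 */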
#define	SLEEP_ON_VAR				\
	unsigned long flags;			\
	wait_queue_t wait;			\
	init_waitqueue_entry(&wait, current);

#define	SLEEP_ON_HEAD					\
	wq_write_lock_irqsave(&q->lock,flags);		\
	__add_wait_queue(q, &wait);			\
	wq_write_unlock(&q->lock);

#define	SLEEP_ON_TAIL						\
	wq_write_lock_irq(&q->lock);				\
	__remove_wait_queue(q, &wait);				\
	wq_write_unlock_irqrestore(&q->lock,flags);

void interruptible_sleep_on(wait_queue_head_t *q)
{
	SLEEP_ON_VAR

	current->state = TASK_INTERRUPTIBLE;

	SLEEP_ON_HEAD
	schedule();
	SLEEP_ON_TAIL
}

long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	SLEEP_ON_VAR

	current->state = TASK_INTERRUPTIBLE;

	SLEEP_ON_HEAD
	timeout = schedule_timeout(timeout);
	SLEEP_ON_TAIL

	return timeout;
}

void sleep_on(wait_queue_head_t *q)
{
	SLEEP_ON_VAR

	current->state = TASK_UNINTERRUPTIBLE;

	SLEEP_ON_HEAD
	schedule();
	SLEEP_ON_TAIL
}

long sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	SLEEP_ON_VAR

	current->state = TASK_UNINTERRUPTIBLE;

	SLEEP_ON_HEAD
	timeout = schedule_timeout(timeout);
	SLEEP_ON_TAIL

	return timeout;
}

void scheduling_functions_end_here(void) { }
#ifndef __alpha__

/*
 * This has been replaced by sys_setpriority.  Maybe it should be
 * moved into the arch dependent tree for those ports that require
 * it for backward compatibility?
 */

asmlinkage long sys_nice(int increment)
{
	long newprio;

	/*
	 *	Setpriority might change our priority at the same moment.
	 *	We don't have to worry. Conceptually one call occurs first
	 *	and we have a single winner.
	 */
	if (increment < 0) {
		if (!capable(CAP_SYS_NICE))
			return -EPERM;
		if (increment < -40)
			increment = -40;
	}
	if (increment > 40)
		increment = 40;

	newprio = current->nice + increment;
	if (newprio < -20)
		newprio = -20;
	if (newprio > 19)
		newprio = 19;
	current->nice = newprio;
	return 0;
}

#endif
static inline struct task_struct *find_process_by_pid(pid_t pid)
{
	struct task_struct *tsk = current;

	if (pid)
		tsk = find_task_by_pid(pid);
	return tsk;
}

static int setscheduler(pid_t pid, int policy,
			struct sched_param *param)
{
	struct sched_param lp;
	struct task_struct *p;
	int retval;

	retval = -EINVAL;
	if (!param || pid < 0)
		goto out_nounlock;

	retval = -EFAULT;
	if (copy_from_user(&lp, param, sizeof(struct sched_param)))
		goto out_nounlock;

	/*
	 * We play safe to avoid deadlocks.
	 */
	read_lock_irq(&tasklist_lock);
	spin_lock(&runqueue_lock);

	p = find_process_by_pid(pid);

	retval = -ESRCH;
	if (!p)
		goto out_unlock;

	if (policy < 0)
		policy = p->policy;
	else {
		retval = -EINVAL;
		if (policy != SCHED_FIFO && policy != SCHED_RR &&
				policy != SCHED_OTHER)
			goto out_unlock;
	}

	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
	 * priority for SCHED_OTHER is 0.
	 */
	retval = -EINVAL;
	if (lp.sched_priority < 0 || lp.sched_priority > 99)
		goto out_unlock;
	if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
		goto out_unlock;

	retval = -EPERM;
	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
	    !capable(CAP_SYS_NICE))
		goto out_unlock;
	if ((current->euid != p->euid) && (current->euid != p->uid) &&
	    !capable(CAP_SYS_NICE))
		goto out_unlock;

	retval = 0;
	p->policy = policy;
	p->rt_priority = lp.sched_priority;
	if (task_on_runqueue(p))
		move_first_runqueue(p);

	current->need_resched = 1;

out_unlock:
	spin_unlock(&runqueue_lock);
	read_unlock_irq(&tasklist_lock);

out_nounlock:
	return retval;
}
asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
				       struct sched_param *param)
{
	return setscheduler(pid, policy, param);
}

asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param)
{
	return setscheduler(pid, -1, param);
}

asmlinkage long sys_sched_getscheduler(pid_t pid)
{
	struct task_struct *p;
	int retval;

	retval = -EINVAL;
	if (pid < 0)
		goto out_nounlock;

	retval = -ESRCH;
	read_lock(&tasklist_lock);
	p = find_process_by_pid(pid);
	if (p)
		retval = p->policy & ~SCHED_YIELD;
	read_unlock(&tasklist_lock);

out_nounlock:
	return retval;
}

asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
{
	struct task_struct *p;
	struct sched_param lp;
	int retval;

	retval = -EINVAL;
	if (!param || pid < 0)
		goto out_nounlock;

	read_lock(&tasklist_lock);
	p = find_process_by_pid(pid);
	retval = -ESRCH;
	if (!p)
		goto out_unlock;
	lp.sched_priority = p->rt_priority;
	read_unlock(&tasklist_lock);

	/*
	 * This one might sleep, we cannot do it with a spinlock held ...
	 */
	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;

out_nounlock:
	return retval;

out_unlock:
	read_unlock(&tasklist_lock);
	return retval;
}
asmlinkage long sys_sched_yield(void)
{
	/*
	 * Trick. sched_yield() first counts the number of truly
	 * 'pending' runnable processes, then returns if it's
	 * only the current processes. (This test does not have
	 * to be atomic.) In threaded applications this optimization
	 * gets triggered quite often.
	 */

	int nr_pending = nr_running;

#if CONFIG_SMP
	int i;

	// Substract non-idle processes running on other CPUs.
	for (i = 0; i < smp_num_cpus; i++)
		if (aligned_data[i].schedule_data.curr != idle_task(i))
			nr_pending--;
#else
	// on UP this process is on the runqueue as well
	nr_pending--;
#endif
	if (nr_pending) {
		/*
		 * This process can only be rescheduled by us,
		 * so this is safe without any locking.
		 */
		if (current->policy == SCHED_OTHER)
			current->policy |= SCHED_YIELD;
		current->need_resched = 1;
	}
	return 0;
}
asmlinkage long sys_sched_get_priority_max(int policy)
{
	int ret = -EINVAL;

	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		ret = 99;
		break;
	case SCHED_OTHER:
		ret = 0;
		break;
	}
	return ret;
}

asmlinkage long sys_sched_get_priority_min(int policy)
{
	int ret = -EINVAL;

	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		ret = 1;
		break;
	case SCHED_OTHER:
		ret = 0;
	}
	return ret;
}

asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
{
	struct timespec t;
	struct task_struct *p;
	int retval = -EINVAL;

	if (pid < 0)
		goto out_nounlock;

	retval = -ESRCH;
	read_lock(&tasklist_lock);
	p = find_process_by_pid(pid);
	if (p)
		jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
				    &t);
	read_unlock(&tasklist_lock);
	if (p)
		retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
out_nounlock:
	return retval;
}
static void show_task(struct task_struct * p)
{
	unsigned long free = 0;
	int state;
	static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };

	printk("%-8s ", p->comm);
	state = p->state ? ffz(~p->state) + 1 : 0;
	if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
		printk(stat_nam[state]);
	else
		printk(" ");
#if (BITS_PER_LONG == 32)
	if (p == current)
		printk(" current  ");
	else
		printk(" %08lX ", thread_saved_pc(&p->thread));
#else
	if (p == current)
		printk("   current task   ");
	else
		printk(" %016lx ", thread_saved_pc(&p->thread));
#endif
	{
		unsigned long * n = (unsigned long *) (p+1);
		while (!*n)
			n++;
		free = (unsigned long) n - (unsigned long)(p+1);
	}
	printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
	if (p->p_cptr)
		printk("%5d ", p->p_cptr->pid);
	else
		printk("      ");
	if (!p->mm)
		printk(" (L-TLB) ");
	else
		printk(" (NOTLB) ");
	if (p->p_ysptr)
		printk("%7d", p->p_ysptr->pid);
	else
		printk("       ");
	if (p->p_osptr)
		printk(" %5d\n", p->p_osptr->pid);
	else
		printk("\n");

	{
		struct sigqueue *q;
		char s[sizeof(sigset_t)*2+1], b[sizeof(sigset_t)*2+1];

		render_sigset_t(&p->pending.signal, s);
		render_sigset_t(&p->blocked, b);
		printk("   sig: %d %s %s :", signal_pending(p), s, b);
		for (q = p->pending.head; q ; q = q->next)
			printk(" %d", q->info.si_signo);
		printk(" X\n");
	}
}

char * render_sigset_t(sigset_t *set, char *buffer)
{
	int i = _NSIG, x;
	do {
		i -= 4, x = 0;
		if (sigismember(set, i+1)) x |= 1;
		if (sigismember(set, i+2)) x |= 2;
		if (sigismember(set, i+3)) x |= 4;
		if (sigismember(set, i+4)) x |= 8;
		*buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
	} while (i >= 4);
	*buffer = 0;
	return buffer;
}
void show_state(void)
{
	struct task_struct *p;

#if (BITS_PER_LONG == 32)
	printk("\n"
	       "                         free                        sibling\n");
	printk("  task             PC    stack   pid father child younger older\n");
#else
	printk("\n"
	       "                                 free                        sibling\n");
	printk("  task                 PC        stack   pid father child younger older\n");
#endif
	read_lock(&tasklist_lock);
	for_each_task(p)
		show_task(p);
	read_unlock(&tasklist_lock);
}
/*
 * Put all the gunge required to become a kernel thread without
 * attached user resources in one place where it belongs.
 */

void daemonize(void)
{
	struct fs_struct *fs;

	/*
	 * If we were started as result of loading a module, close all of the
	 * user space pages.  We don't need them, and if we didn't close them
	 * they would be locked into memory.
	 */
	exit_mm(current);

	current->session = 1;
	current->pgrp = 1;

	/* Become as one with the init task */

	exit_fs(current);	/* current->fs->count--; */
	fs = init_task.fs;
	current->fs = fs;
	atomic_inc(&fs->count);
	exit_files(current);
	current->files = init_task.files;
	atomic_inc(&current->files->count);
}

void __init init_idle(void)
{
	struct schedule_data * sched_data;
	sched_data = &aligned_data[smp_processor_id()].schedule_data;

	if (current != &init_task && task_on_runqueue(current)) {
		printk("UGH! (%d:%d) was on the runqueue, removing.\n",
			smp_processor_id(), current->pid);
		del_from_runqueue(current);
	}
	sched_data->curr = current;
	sched_data->last_schedule = get_cycles();
}

extern void init_timervecs (void);

void __init sched_init(void)
{
	/*
	 * We have to do a little magic to get the first
	 * process right in SMP mode.
	 */
	int cpu = smp_processor_id();
	int nr;

	init_task.processor = cpu;

	for(nr = 0; nr < PIDHASH_SZ; nr++)
		pidhash[nr] = NULL;

	init_timervecs();

	init_bh(TIMER_BH, timer_bh);
	init_bh(TQUEUE_BH, tqueue_bh);
	init_bh(IMMEDIATE_BH, immediate_bh);

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current, cpu);
}